Doublet inference was done using two packages: Scrublet and DoubletDetection. These processing steps load each of the 35 datasets, filter out low-quality cells, then infer the presence of doublets. Each cell's inferred status (doublet or singlet) is stored in the metadata of its dataset. The datasets are then normalized to 10000 UMIs per cell and aggregated to enable a first-look analysis of the doublet inference results.
import numpy as np
import pandas as pd
import scanpy as sc
import os
import scrublet as scr
import doubletdetection
# Make scanpy log everything up to and including hints.
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of scanpy and its main dependencies for reproducibility.
sc.logging.print_versions()
scanpy==1.4+39.gc70f24b anndata==0.6.18 numpy==1.16.2 scipy==1.2.1 pandas==0.24.1 scikit-learn==0.20.3 statsmodels==0.9.0 python-igraph==0.7.1
def remove_RB_genes(
    df,
    path_to_RB_genes_file = '/home/deprez/HCA/PeerLab_analysis/RB_genes'
):
    """Remove all columns of RB genes as listed in the RB gene file.

    Parameters
    ----------
    df : pd.DataFrame
        Expression table with cells as rows and gene names as columns.
        It is not modified.
    path_to_RB_genes_file : str
        Text file containing gene names, one gene name per line.

    Returns
    -------
    df_RB_depleted : pd.DataFrame
        Copy of ``df`` without the RB gene columns.
    counts_removed_per_cell : pd.Series
        Sum of the removed RB gene columns for each cell, indexed like
        ``df`` (float dtype; all zeros when no RB gene is present).
    RB_genes_in_df : list of str
        RB genes from the file that are columns of ``df``, in file order
        and without duplicates.

    Raises
    ------
    OSError
        If the RB gene file cannot be opened.
    """
    with open(path_to_RB_genes_file, 'r') as file:
        # Strip surrounding whitespace so stray spaces do not prevent a match.
        genes = [line.strip() for line in file]

    df_genes = df.columns
    # dict.fromkeys de-duplicates while keeping the file order: a gene listed
    # twice must be counted (and dropped) only once.
    RB_genes_in_df = [
        gene for gene in dict.fromkeys(genes) if gene and gene in df_genes
    ]

    # Number of RB molecules per cell, as one vectorised row sum instead of a
    # Python-level loop over every cell.
    counts_removed_per_cell = df[RB_genes_in_df].sum(axis=1).astype(float)

    # Now drop all columns with RB genes.
    df_RB_depleted = df.drop(columns=RB_genes_in_df)
    return df_RB_depleted, counts_removed_per_cell, RB_genes_in_df
# Render scanpy figures at 80 dpi.
sc.settings.set_figure_params(dpi=80)
# Work from the data directory: the sample paths used below are relative to it.
os.chdir('/home/deprez/HCA/Data/')
# Sub-path, appended to each sample folder, of the filtered 10x matrix directory.
outsPath = 'outs/filtered_gene_bc_matrices/ucagenomix-cellranger-hg19-1.3.0/'
# Read the filtered 10x matrix of sample D322_Biop_Nas1, using gene symbols as
# variable names (made unique afterwards).
D322_Biop_Nas1 = sc.read_10x_mtx(
    './D322_Biop_Nas1/' + outsPath, var_names='gene_symbols', cache=True)
D322_Biop_Nas1.var_names_make_unique()

# Sample-level annotations, identical for every cell of this dataset.
D322_Biop_Nas1.obs['manip'] = 'D322_Biop_Nas1'
D322_Biop_Nas1.obs['position'] = 'Nasal'
D322_Biop_Nas1.obs['method'] = 'Biopsy'
D322_Biop_Nas1.obs['donor'] = 'D322'

# Prefix each barcode with the sample name and use the result as the cell name.
D322_Biop_Nas1.obs['name'] = [
    'D322_Biop_Nas1_' + barcode for barcode in D322_Biop_Nas1.obs_names]
D322_Biop_Nas1.obs_names = D322_Biop_Nas1.obs['name']
D322_Biop_Nas1
... reading from cache file ./cache/D322_Biop_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1797 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
sc.pl.highest_expr_genes(D322_Biop_Nas1, n_top=20)
# A threshold of 0 genes is used here only to get the per-cell 'n_genes'
# column added to .obs; the real gene-count filter is applied later.
sc.pp.filter_cells(D322_Biop_Nas1, min_genes=0)

# Fraction of each cell's counts coming from mitochondrial ('MT-') genes.
mito_genes = D322_Biop_Nas1.var_names.str.startswith('MT-')
D322_Biop_Nas1.obs['percent_mito'] = (
    D322_Biop_Nas1[:, mito_genes].X.sum(axis=1).A1
    / D322_Biop_Nas1.X.sum(axis=1).A1)
# Total counts per cell.
D322_Biop_Nas1.obs['n_counts'] = D322_Biop_Nas1.X.sum(axis=1).A1

# Fraction of each cell's counts coming from the genes of the RB gene file.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D322_Biop_Nas1.to_df())
ribo_genes = D322_Biop_Nas1.var_names.isin(RB_genes_in_df)
D322_Biop_Nas1.obs['percent_ribo'] = (
    D322_Biop_Nas1[:, ribo_genes].X.sum(axis=1).A1
    / D322_Biop_Nas1.X.sum(axis=1).A1)
# Despite its name, this column is True for the genes that are NOT in the
# RB gene list (it is the negated mask).
D322_Biop_Nas1.var['ribo_genes'] = np.logical_not(ribo_genes)

sc.pl.violin(
    D322_Biop_Nas1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering: at least 500 detected genes, fewer than 40000
# counts and less than 20% mitochondrial counts.
sc.pp.filter_cells(D322_Biop_Nas1, min_genes=500)
D322_Biop_Nas1 = D322_Biop_Nas1[
    (D322_Biop_Nas1.obs['n_counts'] < 40000)
    & (D322_Biop_Nas1.obs['percent_mito'] < 0.2), :]
filtered out 10 cells that have less than 500 genes expressed
# Display the (cells, genes) dimensions remaining after filtering.
D322_Biop_Nas1.shape
(1780, 32739)
# scrublet
# Run on the filtered matrix before it is normalized (normalization happens
# further down), with an expected doublet rate of 1.6%.
scrub = scr.Scrublet(D322_Biop_Nas1.X, expected_doublet_rate=0.016)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Keep both values returned by scrub_doublets() in the cell metadata.
D322_Biop_Nas1.obs['doublet_scores'] = doublet_scores
D322_Biop_Nas1.obs['predicted_doublets'] = predicted_doublets
# The trailing ';' keeps the notebook from echoing the returned figure objects.
scrub.plot_histogram();
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.17 Detected doublet rate = 0.4% Estimated detectable doublet fraction = 9.6% Overall doublet rate: Expected = 1.6% Estimated = 4.7% Elapsed time: 1.4 seconds
# doubletDetection
# Second doublet caller, run with default settings on the same filtered matrix
# (the logged warning shows the sparse matrix is densified by the classifier).
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D322_Biop_Nas1.X).predict()
# Store the predicted labels next to the Scrublet results in the cell metadata.
D322_Biop_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4085679054260254 seconds Jaccard graph constructed in 0.3359701633453369 seconds Wrote graph to binary file in 0.1134796142578125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.851025 After 2 runs, maximum modularity is Q = 0.852223 Louvain completed 22 runs in 0.776637077331543 seconds PhenoGraph complete in 1.6490111351013184 seconds Found communities [-1, ... 12], with sizes: [132, 489, 246, 245, 199, 178, 157, 146, 131, 84, 74, 66, 59, 19] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40769004821777344 seconds Jaccard graph constructed in 0.29503822326660156 seconds Wrote graph to binary file in 0.03574967384338379 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.85688 Louvain completed 21 runs in 0.6499230861663818 seconds PhenoGraph complete in 1.4009625911712646 seconds Found communities [-1, ... 11], with sizes: [148, 434, 306, 261, 245, 187, 150, 129, 113, 89, 78, 55, 30] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30634331703186035 seconds Jaccard graph constructed in 0.3103601932525635 seconds Wrote graph to binary file in 0.034491539001464844 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.851977 After 3 runs, maximum modularity is Q = 0.854194 Louvain completed 23 runs in 0.8237855434417725 seconds PhenoGraph complete in 1.4866650104522705 seconds Found communities [-1, ... 13], with sizes: [141, 324, 273, 238, 231, 172, 159, 143, 116, 103, 82, 77, 76, 49, 41] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40821146965026855 seconds Jaccard graph constructed in 0.30414676666259766 seconds Wrote graph to binary file in 0.11415791511535645 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.855464 Louvain completed 21 runs in 0.6677343845367432 seconds PhenoGraph complete in 1.504737138748169 seconds Found communities [-1, ... 14], with sizes: [147, 339, 219, 203, 203, 203, 192, 161, 150, 117, 82, 73, 54, 35, 28, 19] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30776405334472656 seconds Jaccard graph constructed in 0.3069312572479248 seconds Wrote graph to binary file in 0.03258657455444336 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.852832 Louvain completed 21 runs in 0.6703841686248779 seconds PhenoGraph complete in 1.327599287033081 seconds Found communities [-1, ... 
15], with sizes: [150, 420, 231, 225, 222, 175, 171, 114, 109, 91, 90, 77, 45, 39, 39, 16, 11] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30767202377319336 seconds Jaccard graph constructed in 0.2992825508117676 seconds Wrote graph to binary file in 0.12420010566711426 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.856174 Louvain completed 21 runs in 0.6723473072052002 seconds PhenoGraph complete in 1.4187166690826416 seconds Found communities [-1, ... 13], with sizes: [136, 400, 269, 264, 224, 192, 163, 154, 137, 97, 79, 40, 36, 21, 13] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3098146915435791 seconds Jaccard graph constructed in 0.3055295944213867 seconds Wrote graph to binary file in 0.035646915435791016 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.854862 Louvain completed 21 runs in 0.6756174564361572 seconds PhenoGraph complete in 1.337935209274292 seconds Found communities [-1, ... 13], with sizes: [144, 291, 230, 223, 220, 210, 173, 172, 160, 102, 82, 75, 69, 38, 36] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2076280117034912 seconds Jaccard graph constructed in 0.29665303230285645 seconds Wrote graph to binary file in 0.03459882736206055 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.855011 After 2 runs, maximum modularity is Q = 0.856402 After 5 runs, maximum modularity is Q = 0.857599 After 20 runs, maximum modularity is Q = 0.859197 Louvain completed 40 runs in 1.4657461643218994 seconds PhenoGraph complete in 2.0164525508880615 seconds Found communities [-1, ... 14], with sizes: [181, 330, 210, 207, 178, 165, 162, 161, 154, 132, 96, 78, 60, 52, 47, 12] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20734572410583496 seconds Jaccard graph constructed in 0.290557861328125 seconds Wrote graph to binary file in 0.13068079948425293 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.857174 Louvain completed 21 runs in 0.6959750652313232 seconds PhenoGraph complete in 1.3394224643707275 seconds Found communities [-1, ... 13], with sizes: [113, 438, 256, 199, 192, 175, 141, 139, 133, 133, 91, 82, 80, 42, 11] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31047511100769043 seconds Jaccard graph constructed in 0.3088827133178711 seconds Wrote graph to binary file in 0.03481912612915039 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.852402 Louvain completed 21 runs in 0.6792612075805664 seconds PhenoGraph complete in 1.34515380859375 seconds Found communities [-1, ... 12], with sizes: [143, 408, 229, 224, 223, 212, 194, 155, 129, 96, 77, 56, 43, 36] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3071448802947998 seconds Jaccard graph constructed in 0.3109931945800781 seconds Wrote graph to binary file in 0.13086557388305664 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.85427 Louvain completed 21 runs in 0.6788380146026611 seconds PhenoGraph complete in 1.4446029663085938 seconds Found communities [-1, ... 14], with sizes: [133, 362, 242, 228, 204, 202, 193, 147, 131, 96, 81, 74, 50, 49, 21, 12] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20951056480407715 seconds Jaccard graph constructed in 0.2941608428955078 seconds Wrote graph to binary file in 0.03218817710876465 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.858102 After 4 runs, maximum modularity is Q = 0.859675 Louvain completed 24 runs in 0.8347189426422119 seconds PhenoGraph complete in 1.3833091259002686 seconds Found communities [-1, ... 
15], with sizes: [138, 482, 236, 220, 182, 171, 134, 131, 130, 94, 84, 80, 57, 33, 24, 17, 12] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20773720741271973 seconds Jaccard graph constructed in 0.27033162117004395 seconds Wrote graph to binary file in 0.03422141075134277 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.854994 After 3 runs, maximum modularity is Q = 0.856285 Louvain completed 23 runs in 0.8078761100769043 seconds PhenoGraph complete in 1.3342747688293457 seconds Found communities [-1, ... 12], with sizes: [142, 346, 250, 238, 195, 192, 181, 172, 132, 110, 98, 80, 45, 44] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3062746524810791 seconds Jaccard graph constructed in 0.3097362518310547 seconds Wrote graph to binary file in 0.12377023696899414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.854888 After 2 runs, maximum modularity is Q = 0.856021 Louvain completed 22 runs in 0.803412675857544 seconds PhenoGraph complete in 1.5556888580322266 seconds Found communities [-1, ... 12], with sizes: [141, 446, 245, 230, 210, 189, 134, 129, 125, 104, 86, 77, 74, 35] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3096494674682617 seconds Jaccard graph constructed in 0.2952749729156494 seconds Wrote graph to binary file in 0.0346531867980957 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.851256 After 3 runs, maximum modularity is Q = 0.853721 Louvain completed 23 runs in 0.8304622173309326 seconds PhenoGraph complete in 1.4832394123077393 seconds Found communities [-1, ... 15], with sizes: [124, 421, 242, 234, 180, 164, 148, 143, 130, 87, 83, 76, 67, 67, 26, 19, 14] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3072354793548584 seconds Jaccard graph constructed in 0.2686634063720703 seconds Wrote graph to binary file in 0.03485536575317383 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.852857 After 2 runs, maximum modularity is Q = 0.854892 Louvain completed 22 runs in 0.7863106727600098 seconds PhenoGraph complete in 1.421738624572754 seconds Found communities [-1, ... 15], with sizes: [143, 455, 230, 228, 189, 162, 124, 113, 104, 99, 82, 81, 76, 52, 42, 25, 20] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30680036544799805 seconds Jaccard graph constructed in 0.41535520553588867 seconds Wrote graph to binary file in 0.034188270568847656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.849737 After 14 runs, maximum modularity is Q = 0.850778 Louvain completed 34 runs in 1.1089999675750732 seconds PhenoGraph complete in 1.8784644603729248 seconds Found communities [-1, ... 15], with sizes: [143, 434, 237, 233, 226, 163, 155, 153, 139, 102, 72, 51, 39, 37, 17, 12, 12] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3088347911834717 seconds Jaccard graph constructed in 0.2695302963256836 seconds Wrote graph to binary file in 0.03755521774291992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.857925 Louvain completed 21 runs in 0.7062399387359619 seconds PhenoGraph complete in 1.3346896171569824 seconds Found communities [-1, ... 14], with sizes: [133, 278, 223, 214, 187, 186, 172, 169, 132, 115, 85, 84, 74, 71, 68, 34] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3076784610748291 seconds Jaccard graph constructed in 0.30925512313842773 seconds Wrote graph to binary file in 0.1296396255493164 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.851976 Louvain completed 21 runs in 0.6892428398132324 seconds PhenoGraph complete in 1.448503017425537 seconds Found communities [-1, ... 
14], with sizes: [106, 361, 236, 223, 187, 151, 148, 122, 115, 110, 103, 102, 79, 77, 66, 39] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30797600746154785 seconds Jaccard graph constructed in 0.27184224128723145 seconds Wrote graph to binary file in 0.03627347946166992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.852469 Louvain completed 21 runs in 0.6978359222412109 seconds PhenoGraph complete in 1.3291597366333008 seconds Found communities [-1, ... 16], with sizes: [151, 373, 209, 191, 189, 178, 166, 141, 125, 97, 96, 79, 61, 56, 51, 36, 13, 13] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.307117223739624 seconds Jaccard graph constructed in 0.2681887149810791 seconds Wrote graph to binary file in 0.03509783744812012 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.856575 After 4 runs, maximum modularity is Q = 0.857611 Louvain completed 24 runs in 0.8502638339996338 seconds PhenoGraph complete in 1.4721050262451172 seconds Found communities [-1, ... 16], with sizes: [153, 465, 260, 226, 177, 157, 152, 152, 132, 83, 79, 58, 38, 35, 22, 13, 12, 11] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30603790283203125 seconds Jaccard graph constructed in 0.31660890579223633 seconds Wrote graph to binary file in 0.12119674682617188 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.854788 After 13 runs, maximum modularity is Q = 0.85586 Louvain completed 33 runs in 1.109736442565918 seconds PhenoGraph complete in 1.8680367469787598 seconds Found communities [-1, ... 14], with sizes: [138, 432, 211, 211, 192, 146, 145, 142, 122, 117, 105, 76, 59, 57, 54, 18] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3064150810241699 seconds Jaccard graph constructed in 0.296893835067749 seconds Wrote graph to binary file in 0.0340418815612793 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.850866 After 2 runs, maximum modularity is Q = 0.854173 Louvain completed 22 runs in 0.7796761989593506 seconds PhenoGraph complete in 1.4281444549560547 seconds Found communities [-1, ... 13], with sizes: [137, 449, 233, 222, 216, 198, 149, 129, 99, 98, 96, 84, 53, 51, 11] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3071146011352539 seconds Jaccard graph constructed in 0.3070831298828125 seconds Wrote graph to binary file in 0.12588262557983398 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.857457 Louvain completed 21 runs in 0.6961038112640381 seconds PhenoGraph complete in 1.448686122894287 seconds Found communities [-1, ... 13], with sizes: [149, 448, 239, 223, 201, 191, 133, 130, 119, 115, 82, 78, 66, 34, 17] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3065071105957031 seconds Jaccard graph constructed in 0.27219343185424805 seconds Wrote graph to binary file in 0.03264212608337402 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.853015 After 2 runs, maximum modularity is Q = 0.854128 After 3 runs, maximum modularity is Q = 0.855507 Louvain completed 23 runs in 0.927466869354248 seconds PhenoGraph complete in 1.5605666637420654 seconds Found communities [-1, ... 12], with sizes: [119, 342, 313, 245, 236, 234, 166, 145, 103, 99, 76, 60, 48, 39]
# Normalization is done only after both doublet callers have run above.
sc.pp.normalize_per_cell(D322_Biop_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D322_Biop_Nas1) # log transform the data
# .raw is set before the gene subsetting below, so it still holds every gene.
D322_Biop_Nas1.raw = D322_Biop_Nas1 # freeze the object (for later use of the raw state of it)
# var['ribo_genes'] was stored as the negated ribosomal mask, so this keeps
# the genes that are NOT in the RB gene list.
D322_Biop_Nas1 = D322_Biop_Nas1[:, D322_Biop_Nas1.var['ribo_genes']]
D322_Biop_Nas1
View of AnnData object with n_obs × n_vars = 1780 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Read the filtered 10x matrix of sample D339_Biop_Nas1, using gene symbols as
# variable names (made unique afterwards).
D339_Biop_Nas1 = sc.read_10x_mtx(
    './D339_Biop_Nas1/' + outsPath, var_names='gene_symbols', cache=True)
D339_Biop_Nas1.var_names_make_unique()

# Sample-level annotations, identical for every cell of this dataset.
D339_Biop_Nas1.obs['manip'] = 'D339_Biop_Nas1'
D339_Biop_Nas1.obs['position'] = 'Nasal'
D339_Biop_Nas1.obs['method'] = 'Biopsy'
D339_Biop_Nas1.obs['donor'] = 'D339'

# Prefix each barcode with the sample name and use the result as the cell name.
D339_Biop_Nas1.obs['name'] = [
    'D339_Biop_Nas1_' + barcode for barcode in D339_Biop_Nas1.obs_names]
D339_Biop_Nas1.obs_names = D339_Biop_Nas1.obs['name']
D339_Biop_Nas1
... reading from cache file ./cache/D339_Biop_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1917 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
sc.pl.highest_expr_genes(D339_Biop_Nas1, n_top=20)
# A threshold of 0 genes is used here only to get the per-cell 'n_genes'
# column added to .obs; the real gene-count filter is applied later.
sc.pp.filter_cells(D339_Biop_Nas1, min_genes=0)

# Fraction of each cell's counts coming from mitochondrial ('MT-') genes.
mito_genes = D339_Biop_Nas1.var_names.str.startswith('MT-')
D339_Biop_Nas1.obs['percent_mito'] = (
    D339_Biop_Nas1[:, mito_genes].X.sum(axis=1).A1
    / D339_Biop_Nas1.X.sum(axis=1).A1)
# Total counts per cell.
D339_Biop_Nas1.obs['n_counts'] = D339_Biop_Nas1.X.sum(axis=1).A1

# Fraction of each cell's counts coming from the genes of the RB gene file.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D339_Biop_Nas1.to_df())
ribo_genes = D339_Biop_Nas1.var_names.isin(RB_genes_in_df)
D339_Biop_Nas1.obs['percent_ribo'] = (
    D339_Biop_Nas1[:, ribo_genes].X.sum(axis=1).A1
    / D339_Biop_Nas1.X.sum(axis=1).A1)
# Despite its name, this column is True for the genes that are NOT in the
# RB gene list (it is the negated mask).
D339_Biop_Nas1.var['ribo_genes'] = np.logical_not(ribo_genes)

sc.pl.violin(
    D339_Biop_Nas1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering: at least 500 detected genes, fewer than 40000
# counts and less than 15% mitochondrial counts.
sc.pp.filter_cells(D339_Biop_Nas1, min_genes=500)
D339_Biop_Nas1 = D339_Biop_Nas1[
    (D339_Biop_Nas1.obs['n_counts'] < 40000)
    & (D339_Biop_Nas1.obs['percent_mito'] < 0.15), :]
filtered out 3 cells that have less than 500 genes expressed
# scrublet
# Run on the filtered matrix before it is normalized. The expected doublet
# rate is set explicitly to 1.6%, the same value used for D322_Biop_Nas1.
# Previously a second `scr.Scrublet(D339_Biop_Nas1.X)` call overwrote this
# object, so Scrublet's default rate was silently used instead.
scrub = scr.Scrublet(D339_Biop_Nas1.X, expected_doublet_rate=0.016)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Keep both values returned by scrub_doublets() in the cell metadata.
D339_Biop_Nas1.obs['doublet_scores'] = doublet_scores
D339_Biop_Nas1.obs['predicted_doublets'] = predicted_doublets
# The trailing ';' keeps the notebook from echoing the returned figure objects.
scrub.plot_histogram();
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.47 Detected doublet rate = 0.5% Estimated detectable doublet fraction = 41.5% Overall doublet rate: Expected = 10.0% Estimated = 1.3% Elapsed time: 1.4 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecaf6ff28>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecaf04438>],
dtype=object))
# doubletDetection
# Second doublet caller, run with default settings on the same filtered matrix
# (the logged warning shows the sparse matrix is densified by the classifier).
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D339_Biop_Nas1.X).predict()
# Store the predicted labels next to the Scrublet results in the cell metadata.
D339_Biop_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2066202163696289 seconds Jaccard graph constructed in 0.3158748149871826 seconds Wrote graph to binary file in 0.04000139236450195 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914813 After 3 runs, maximum modularity is Q = 0.916011 Louvain completed 23 runs in 0.8884556293487549 seconds PhenoGraph complete in 1.4616329669952393 seconds Found communities [-1, ... 21], with sizes: [94, 295, 221, 207, 133, 132, 132, 131, 121, 118, 104, 96, 93, 91, 66, 52, 50, 49, 41, 36, 33, 29, 24] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30696773529052734 seconds Jaccard graph constructed in 0.3256070613861084 seconds Wrote graph to binary file in 0.12011384963989258 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914471 Louvain completed 21 runs in 0.7269222736358643 seconds PhenoGraph complete in 1.4904849529266357 seconds Found communities [-1, ... 19], with sizes: [87, 383, 243, 217, 167, 166, 125, 119, 118, 96, 96, 85, 70, 64, 57, 51, 50, 50, 40, 35, 29] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31159067153930664 seconds Jaccard graph constructed in 0.32219552993774414 seconds Wrote graph to binary file in 0.04117703437805176 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.917203 Louvain completed 21 runs in 0.7120158672332764 seconds PhenoGraph complete in 1.398862361907959 seconds Found communities [-1, ... 20], with sizes: [92, 265, 231, 159, 156, 155, 145, 112, 112, 110, 109, 97, 94, 92, 84, 71, 56, 49, 48, 43, 35, 33] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3081836700439453 seconds Jaccard graph constructed in 0.31020474433898926 seconds Wrote graph to binary file in 0.12661218643188477 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914894 Louvain completed 21 runs in 0.7269814014434814 seconds PhenoGraph complete in 1.4843578338623047 seconds Found communities [-1, ... 19], with sizes: [84, 273, 250, 203, 155, 149, 149, 141, 114, 109, 109, 107, 98, 81, 61, 55, 51, 44, 41, 39, 35] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20717930793762207 seconds Jaccard graph constructed in 0.31676197052001953 seconds Wrote graph to binary file in 0.0416409969329834 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912832 After 2 runs, maximum modularity is Q = 0.914457 After 4 runs, maximum modularity is Q = 0.915946 Louvain completed 24 runs in 1.0415267944335938 seconds PhenoGraph complete in 1.621366024017334 seconds Found communities [-1, ... 20], with sizes: [104, 305, 186, 173, 151, 144, 141, 139, 126, 103, 100, 97, 87, 85, 72, 66, 53, 51, 50, 42, 38, 35] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3072516918182373 seconds Jaccard graph constructed in 0.32131314277648926 seconds Wrote graph to binary file in 0.1205141544342041 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.916629 Louvain completed 21 runs in 0.7400810718536377 seconds PhenoGraph complete in 1.5056416988372803 seconds Found communities [-1, ... 19], with sizes: [82, 338, 226, 178, 145, 142, 142, 131, 112, 112, 104, 92, 91, 86, 85, 61, 52, 51, 45, 39, 34] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20781898498535156 seconds Jaccard graph constructed in 0.3088233470916748 seconds Wrote graph to binary file in 0.04128599166870117 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.916194 After 13 runs, maximum modularity is Q = 0.917268 Louvain completed 33 runs in 1.1683826446533203 seconds PhenoGraph complete in 1.7367794513702393 seconds Found communities [-1, ... 21], with sizes: [98, 262, 159, 150, 137, 135, 134, 123, 119, 115, 111, 111, 103, 102, 100, 61, 59, 53, 53, 50, 42, 36, 35] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3067307472229004 seconds Jaccard graph constructed in 0.33370113372802734 seconds Wrote graph to binary file in 0.12057757377624512 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913375 After 9 runs, maximum modularity is Q = 0.914495 Louvain completed 29 runs in 1.048865795135498 seconds PhenoGraph complete in 1.826646327972412 seconds Found communities [-1, ... 20], with sizes: [96, 241, 223, 175, 147, 138, 124, 123, 118, 113, 112, 108, 105, 97, 81, 74, 57, 51, 49, 41, 40, 35] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2067573070526123 seconds Jaccard graph constructed in 0.31972408294677734 seconds Wrote graph to binary file in 0.04212474822998047 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.915562 Louvain completed 21 runs in 0.7448432445526123 seconds PhenoGraph complete in 1.3286876678466797 seconds Found communities [-1, ... 20], with sizes: [103, 293, 183, 169, 167, 151, 145, 143, 109, 108, 108, 99, 87, 81, 71, 54, 53, 51, 50, 47, 41, 35] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20834088325500488 seconds Jaccard graph constructed in 0.322249174118042 seconds Wrote graph to binary file in 0.12173628807067871 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.915861 After 4 runs, maximum modularity is Q = 0.917071 Louvain completed 24 runs in 0.8866989612579346 seconds PhenoGraph complete in 1.5576601028442383 seconds Found communities [-1, ... 21], with sizes: [88, 260, 231, 222, 147, 142, 129, 122, 119, 112, 109, 104, 87, 86, 70, 55, 51, 50, 41, 38, 36, 30, 19] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2088916301727295 seconds Jaccard graph constructed in 0.33864760398864746 seconds Wrote graph to binary file in 0.04112577438354492 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.915896 Louvain completed 21 runs in 0.7342045307159424 seconds PhenoGraph complete in 1.3354604244232178 seconds Found communities [-1, ... 21], with sizes: [88, 248, 210, 152, 133, 129, 128, 127, 123, 120, 114, 112, 110, 110, 86, 65, 51, 51, 50, 40, 38, 36, 27] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31146860122680664 seconds Jaccard graph constructed in 0.31763625144958496 seconds Wrote graph to binary file in 0.146104097366333 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.916158 Louvain completed 21 runs in 0.7209784984588623 seconds PhenoGraph complete in 1.51529860496521 seconds Found communities [-1, ... 19], with sizes: [112, 247, 223, 204, 168, 158, 144, 144, 123, 115, 109, 107, 89, 71, 70, 52, 52, 49, 41, 40, 30] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30747437477111816 seconds Jaccard graph constructed in 0.3243536949157715 seconds Wrote graph to binary file in 0.04296755790710449 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.915712 Louvain completed 21 runs in 0.7537517547607422 seconds PhenoGraph complete in 1.4425067901611328 seconds Found communities [-1, ... 
19], with sizes: [96, 310, 212, 211, 168, 160, 141, 136, 108, 108, 93, 87, 86, 80, 77, 59, 52, 51, 43, 36, 34] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30921292304992676 seconds Jaccard graph constructed in 0.33057618141174316 seconds Wrote graph to binary file in 0.15000510215759277 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914577 After 10 runs, maximum modularity is Q = 0.915875 Louvain completed 30 runs in 1.1045057773590088 seconds PhenoGraph complete in 1.91184401512146 seconds Found communities [-1, ... 21], with sizes: [105, 278, 205, 192, 183, 156, 135, 115, 111, 107, 105, 99, 97, 88, 69, 53, 49, 38, 37, 36, 34, 28, 28] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20765280723571777 seconds Jaccard graph constructed in 0.3321225643157959 seconds Wrote graph to binary file in 0.04221796989440918 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914789 Louvain completed 21 runs in 0.7014024257659912 seconds PhenoGraph complete in 1.2956562042236328 seconds Found communities [-1, ... 19], with sizes: [84, 304, 218, 218, 135, 132, 132, 121, 115, 113, 113, 103, 96, 93, 89, 59, 54, 51, 43, 40, 35] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30840134620666504 seconds Jaccard graph constructed in 0.31452107429504395 seconds Wrote graph to binary file in 0.041329383850097656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914288 Louvain completed 21 runs in 0.6980297565460205 seconds PhenoGraph complete in 1.3723759651184082 seconds Found communities [-1, ... 20], with sizes: [80, 292, 199, 165, 164, 150, 139, 128, 127, 110, 104, 97, 94, 87, 85, 75, 54, 52, 47, 41, 34, 24] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31139469146728516 seconds Jaccard graph constructed in 0.4412109851837158 seconds Wrote graph to binary file in 0.042226552963256836 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.917063 Louvain completed 21 runs in 0.7077591419219971 seconds PhenoGraph complete in 1.5150043964385986 seconds Found communities [-1, ... 20], with sizes: [85, 284, 254, 190, 153, 141, 119, 116, 112, 111, 110, 98, 94, 90, 77, 54, 53, 51, 48, 47, 31, 30] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3068504333496094 seconds Jaccard graph constructed in 0.3177778720855713 seconds Wrote graph to binary file in 0.04327201843261719 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913495 After 2 runs, maximum modularity is Q = 0.914576 Louvain completed 22 runs in 0.8603401184082031 seconds PhenoGraph complete in 1.540285587310791 seconds Found communities [-1, ... 20], with sizes: [87, 215, 204, 204, 153, 141, 137, 134, 120, 114, 112, 109, 98, 95, 92, 58, 56, 51, 49, 43, 40, 36] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3102548122406006 seconds Jaccard graph constructed in 0.3326091766357422 seconds Wrote graph to binary file in 0.14270853996276855 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.9174 Louvain completed 21 runs in 0.7318401336669922 seconds PhenoGraph complete in 1.528918981552124 seconds Found communities [-1, ... 20], with sizes: [81, 351, 256, 164, 133, 133, 128, 122, 113, 111, 104, 97, 88, 82, 76, 56, 51, 50, 42, 40, 39, 31] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4072751998901367 seconds Jaccard graph constructed in 0.31981754302978516 seconds Wrote graph to binary file in 0.04160428047180176 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913721 After 9 runs, maximum modularity is Q = 0.915008 Louvain completed 29 runs in 1.0451924800872803 seconds PhenoGraph complete in 1.8252418041229248 seconds Found communities [-1, ... 21], with sizes: [82, 228, 191, 164, 163, 155, 143, 124, 115, 107, 105, 104, 95, 90, 87, 79, 60, 53, 49, 41, 40, 38, 35] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3087790012359619 seconds Jaccard graph constructed in 0.34668397903442383 seconds Wrote graph to binary file in 0.1182863712310791 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.91467 Louvain completed 21 runs in 0.7347433567047119 seconds PhenoGraph complete in 1.5200309753417969 seconds Found communities [-1, ... 20], with sizes: [94, 247, 203, 194, 166, 136, 136, 121, 120, 114, 112, 101, 94, 93, 90, 57, 56, 52, 49, 42, 39, 32] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31101465225219727 seconds Jaccard graph constructed in 0.3258686065673828 seconds Wrote graph to binary file in 0.04131960868835449 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913669 Louvain completed 21 runs in 0.7208325862884521 seconds PhenoGraph complete in 1.4099571704864502 seconds Found communities [-1, ... 21], with sizes: [64, 222, 222, 168, 156, 154, 133, 118, 116, 112, 110, 109, 107, 107, 87, 66, 52, 50, 50, 38, 38, 38, 31] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30809998512268066 seconds Jaccard graph constructed in 0.31885600090026855 seconds Wrote graph to binary file in 0.13667058944702148 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.916988 Louvain completed 21 runs in 0.7105915546417236 seconds PhenoGraph complete in 1.4888038635253906 seconds Found communities [-1, ... 21], with sizes: [75, 276, 264, 192, 148, 145, 143, 120, 117, 112, 95, 89, 84, 83, 79, 54, 51, 49, 44, 40, 31, 30, 27] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.308124303817749 seconds Jaccard graph constructed in 0.3518354892730713 seconds Wrote graph to binary file in 0.04296588897705078 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913021 After 2 runs, maximum modularity is Q = 0.914831 Louvain completed 22 runs in 0.8846080303192139 seconds PhenoGraph complete in 1.6036791801452637 seconds Found communities [-1, ... 19], with sizes: [96, 304, 238, 206, 160, 156, 151, 125, 110, 108, 94, 89, 89, 82, 71, 56, 53, 50, 39, 38, 33] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30726146697998047 seconds Jaccard graph constructed in 0.3231089115142822 seconds Wrote graph to binary file in 0.12204957008361816 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.915153 After 2 runs, maximum modularity is Q = 0.916807 Louvain completed 22 runs in 0.840766191482544 seconds PhenoGraph complete in 1.6069214344024658 seconds Found communities [-1, ... 19], with sizes: [92, 267, 256, 201, 179, 151, 147, 125, 116, 114, 112, 93, 86, 86, 67, 53, 52, 48, 35, 35, 33]
# Post-processing of sample D339_Biop_Nas1: per-cell normalization, log
# transform, snapshot into .raw, then gene subsetting. The statement order
# matters because every step operates on the result of the previous one.
sc.pp.normalize_per_cell(D339_Biop_Nas1, counts_per_cell_after=1e4) # scale every cell to 10,000 total counts
sc.pp.log1p(D339_Biop_Nas1) # log transform the data
# Snapshot taken BEFORE the gene subsetting below, so .raw still holds all genes
# (normalized and log-transformed at this point).
D339_Biop_Nas1.raw = D339_Biop_Nas1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): in the D344_Biop_Nas1 QC cell this column is stored as the
# negation of the ribosomal-gene mask, so despite its name it presumably
# selects the NON-ribosomal genes here as well -- confirm against the D339 QC cell.
# The indexing yields an AnnData view (see the repr printed below), not a copy.
D339_Biop_Nas1 = D339_Biop_Nas1[:, D339_Biop_Nas1.var['ribo_genes']]
# Bare expression: displays the object summary as the notebook cell output.
D339_Biop_Nas1
View of AnnData object with n_obs × n_vars = 1879 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the filtered 10x matrix of sample D344_Biop_Nas1, indexing genes by
# symbol (made unique afterwards) and caching the parsed matrix on disk.
D344_Biop_Nas1 = sc.read_10x_mtx(
    './D344_Biop_Nas1/' + outsPath,
    cache=True,
    var_names='gene_symbols')
D344_Biop_Nas1.var_names_make_unique()

# Sample-level annotations, broadcast to every cell of this dataset.
for _field, _label in [
        ('manip', 'D344_Biop_Nas1'),
        ('position', 'Nasal'),
        ('method', 'Biopsy'),
        ('donor', 'D344')]:
    D344_Biop_Nas1.obs[_field] = _label

# Prefix each barcode with the sample name and use the result as the cell
# identifier, so that identifiers carry their sample of origin.
D344_Biop_Nas1.obs['name'] = [
    'D344_Biop_Nas1_' + barcode for barcode in D344_Biop_Nas1.obs.index]
D344_Biop_Nas1.obs_names = D344_Biop_Nas1.obs['name']

# Bare expression: displays the object summary as the notebook cell output.
D344_Biop_Nas1
... reading from cache file ./cache/D344_Biop_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2121 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# ---- Per-cell QC metrics for D344_Biop_Nas1 ----
sc.pl.highest_expr_genes(D344_Biop_Nas1, n_top=20)

# A threshold of 0 cannot remove any cell; the call is made so that
# obs['n_genes'] exists for the violin plot and the later min_genes filter.
sc.pp.filter_cells(D344_Biop_Nas1, min_genes=0)

# Total counts per cell, computed once and reused as the denominator of both
# fractions below (the matrix is not modified in between).
total_counts = D344_Biop_Nas1.X.sum(axis=1).A1

# Fraction of each cell's counts coming from mitochondrial ('MT-' prefixed) genes.
mito_genes = D344_Biop_Nas1.var_names.str.startswith('MT-')
D344_Biop_Nas1.obs['percent_mito'] = np.sum(
    D344_Biop_Nas1[:, mito_genes].X, axis=1).A1 / total_counts
D344_Biop_Nas1.obs['n_counts'] = total_counts

# remove_RB_genes needs a dense cells x genes DataFrame; only the list of
# ribosomal genes present in this dataset is used afterwards, but all three
# return values are still bound to keep the notebook's global names unchanged.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D344_Biop_Nas1.to_df())

# The DataFrame columns are the var_names, so the membership mask is taken
# from var_names directly instead of densifying the matrix a second time.
ribo_genes = D344_Biop_Nas1.var_names.isin(RB_genes_in_df)
D344_Biop_Nas1.obs['percent_ribo'] = np.sum(
    D344_Biop_Nas1[:, ribo_genes].X, axis=1).A1 / total_counts

# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT in
# the ribosomal list, i.e. it is a keep-mask for non-ribosomal genes.
D344_Biop_Nas1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D344_Biop_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# ---- Low-quality cell filtering for D344_Biop_Nas1 ----
# Drop cells with fewer than 500 detected genes (in place).
sc.pp.filter_cells(D344_Biop_Nas1, min_genes=500)

# Keep cells with fewer than 50,000 total counts AND less than 10%
# mitochondrial counts. Both criteria are evaluated on the already
# gene-filtered object and applied as a single mask.
keep_cells = (
    (D344_Biop_Nas1.obs['n_counts'] < 50000)
    & (D344_Biop_Nas1.obs['percent_mito'] < 0.1)
)

# Materialize the subset with .copy(): the doublet-inference cells that follow
# add columns to .obs, and writing to an AnnData view would otherwise force an
# implicit copy of a view-of-a-view.
D344_Biop_Nas1 = D344_Biop_Nas1[keep_cells, :].copy()
filtered out 6 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the QC-filtered cells of D344_Biop_Nas1.
# NOTE(review): X presumably still holds raw counts at this point (no
# normalization call appears between loading and this cell) -- confirm.
# expected_doublet_rate=0.017 is the prior doublet rate handed to Scrublet
# (reported as "Expected = 1.7%" in the cell output).
scrub = scr.Scrublet(D344_Biop_Nas1.X, expected_doublet_rate=0.017)
# Two per-cell results are returned; the cell output reports that the score
# threshold used for the doublet calls was set automatically.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D344_Biop_Nas1.obs['doublet_scores'] = doublet_scores
D344_Biop_Nas1.obs['predicted_doublets'] = predicted_doublets
# Last expression of the cell: draws the histogram and echoes the returned
# (figure, axes) tuple as cell output.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.18 Detected doublet rate = 0.7% Estimated detectable doublet fraction = 28.9% Overall doublet rate: Expected = 1.7% Estimated = 2.3% Elapsed time: 1.5 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecbbb5860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecc770668>],
dtype=object))
# doubletDetection
# Second, independent doublet inference on the same filtered cells, using
# DoubletDetection's BoostClassifier with its default settings (the cell
# output shows 25 iterations of synthetic-doublet creation and clustering).
clf = doubletdetection.BoostClassifier()
# X is passed as a sparse matrix; the library densifies it itself, which is
# the source of the "Sparse raw_counts is automatically densified" warning
# in the cell output.
labels = clf.fit(D344_Biop_Nas1.X).predict()
# Store the per-cell labels in the cell metadata, next to the Scrublet columns.
D344_Biop_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3668093681335449 seconds Jaccard graph constructed in 0.37114810943603516 seconds Wrote graph to binary file in 0.04429221153259277 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905021 Louvain completed 21 runs in 0.8386020660400391 seconds PhenoGraph complete in 1.6339306831359863 seconds Found communities [-1, ... 19], with sizes: [183, 344, 283, 240, 182, 176, 175, 132, 124, 117, 98, 86, 86, 72, 71, 60, 58, 55, 36, 23, 17] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30774354934692383 seconds Jaccard graph constructed in 0.37660694122314453 seconds Wrote graph to binary file in 0.0534367561340332 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902513 Louvain completed 21 runs in 0.8093357086181641 seconds PhenoGraph complete in 1.5636885166168213 seconds Found communities [-1, ... 18], with sizes: [171, 345, 324, 294, 248, 232, 129, 117, 112, 98, 76, 73, 72, 68, 67, 62, 57, 37, 19, 17] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3166005611419678 seconds Jaccard graph constructed in 0.48200368881225586 seconds Wrote graph to binary file in 0.04665350914001465 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903171 Louvain completed 21 runs in 0.8213727474212646 seconds PhenoGraph complete in 1.683635950088501 seconds Found communities [-1, ... 22], with sizes: [132, 389, 363, 171, 156, 151, 142, 120, 104, 102, 101, 87, 85, 75, 74, 67, 63, 60, 51, 42, 33, 18, 17, 15] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3100764751434326 seconds Jaccard graph constructed in 0.35109758377075195 seconds Wrote graph to binary file in 0.04471158981323242 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905114 Louvain completed 21 runs in 0.8178925514221191 seconds PhenoGraph complete in 1.5368318557739258 seconds Found communities [-1, ... 19], with sizes: [140, 374, 329, 249, 244, 217, 180, 118, 100, 97, 92, 86, 68, 66, 57, 54, 50, 40, 23, 18, 16] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3082611560821533 seconds Jaccard graph constructed in 0.48171234130859375 seconds Wrote graph to binary file in 0.043807029724121094 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904891 Louvain completed 21 runs in 0.8361577987670898 seconds PhenoGraph complete in 1.6836771965026855 seconds Found communities [-1, ... 
19], with sizes: [159, 363, 300, 217, 189, 184, 172, 157, 126, 100, 97, 88, 83, 71, 66, 61, 60, 53, 37, 19, 16] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3137693405151367 seconds Jaccard graph constructed in 0.37295007705688477 seconds Wrote graph to binary file in 0.05366945266723633 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90592 Louvain completed 21 runs in 0.8222959041595459 seconds PhenoGraph complete in 1.579627275466919 seconds Found communities [-1, ... 20], with sizes: [183, 359, 225, 200, 186, 185, 143, 127, 116, 109, 102, 101, 91, 84, 76, 60, 57, 54, 53, 48, 47, 12] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30945873260498047 seconds Jaccard graph constructed in 0.3735170364379883 seconds Wrote graph to binary file in 0.1431713104248047 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901523 After 11 runs, maximum modularity is Q = 0.903058 Louvain completed 31 runs in 1.2631654739379883 seconds PhenoGraph complete in 2.102349042892456 seconds Found communities [-1, ... 19], with sizes: [172, 360, 250, 216, 203, 201, 160, 148, 137, 117, 110, 99, 68, 63, 60, 59, 58, 54, 39, 27, 17] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3099653720855713 seconds Jaccard graph constructed in 0.36966371536254883 seconds Wrote graph to binary file in 0.04363894462585449 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905927 Louvain completed 21 runs in 0.8191144466400146 seconds PhenoGraph complete in 1.555978775024414 seconds Found communities [-1, ... 21], with sizes: [151, 346, 265, 253, 177, 174, 149, 136, 130, 103, 101, 96, 86, 69, 66, 57, 57, 56, 56, 35, 27, 17, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30855607986450195 seconds Jaccard graph constructed in 0.35804080963134766 seconds Wrote graph to binary file in 0.14003729820251465 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906856 Louvain completed 21 runs in 0.8057575225830078 seconds PhenoGraph complete in 1.623595952987671 seconds Found communities [-1, ... 20], with sizes: [186, 356, 347, 243, 186, 154, 148, 112, 105, 104, 101, 90, 77, 72, 65, 62, 60, 49, 38, 25, 21, 17] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3120462894439697 seconds Jaccard graph constructed in 0.37120652198791504 seconds Wrote graph to binary file in 0.04476356506347656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90531 Louvain completed 21 runs in 0.7785615921020508 seconds PhenoGraph complete in 1.5257573127746582 seconds Found communities [-1, ... 
22], with sizes: [179, 232, 212, 212, 203, 190, 186, 148, 110, 107, 107, 103, 100, 95, 68, 61, 60, 53, 50, 49, 35, 22, 20, 16] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30761241912841797 seconds Jaccard graph constructed in 0.3466176986694336 seconds Wrote graph to binary file in 0.14148497581481934 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899856 After 3 runs, maximum modularity is Q = 0.901909 Louvain completed 23 runs in 0.9796733856201172 seconds PhenoGraph complete in 1.7867050170898438 seconds Found communities [-1, ... 19], with sizes: [183, 344, 238, 219, 210, 169, 159, 149, 123, 112, 98, 94, 77, 77, 76, 59, 59, 58, 56, 42, 16] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3099222183227539 seconds Jaccard graph constructed in 0.35921549797058105 seconds Wrote graph to binary file in 0.044771432876586914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904101 Louvain completed 21 runs in 0.8093023300170898 seconds PhenoGraph complete in 1.5360081195831299 seconds Found communities [-1, ... 21], with sizes: [178, 352, 261, 213, 167, 166, 143, 134, 113, 109, 100, 90, 80, 78, 76, 72, 65, 56, 56, 46, 32, 17, 14] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31110095977783203 seconds Jaccard graph constructed in 0.3798489570617676 seconds Wrote graph to binary file in 0.14588046073913574 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903504 Louvain completed 21 runs in 0.8239104747772217 seconds PhenoGraph complete in 1.6797006130218506 seconds Found communities [-1, ... 20], with sizes: [200, 377, 210, 186, 184, 182, 171, 167, 145, 133, 113, 97, 86, 64, 64, 54, 52, 40, 39, 23, 16, 15] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3103766441345215 seconds Jaccard graph constructed in 0.37268805503845215 seconds Wrote graph to binary file in 0.04918384552001953 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902063 Louvain completed 21 runs in 0.8856055736541748 seconds PhenoGraph complete in 1.6465637683868408 seconds Found communities [-1, ... 19], with sizes: [157, 357, 280, 240, 199, 199, 195, 123, 105, 102, 96, 95, 79, 68, 62, 60, 59, 58, 43, 24, 17] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3075556755065918 seconds Jaccard graph constructed in 0.41863036155700684 seconds Wrote graph to binary file in 0.12459564208984375 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900123 After 3 runs, maximum modularity is Q = 0.902262 Louvain completed 23 runs in 0.9938900470733643 seconds PhenoGraph complete in 1.8578953742980957 seconds Found communities [-1, ... 18], with sizes: [160, 376, 341, 238, 214, 185, 169, 151, 110, 104, 103, 88, 74, 68, 62, 54, 50, 37, 17, 17] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3081660270690918 seconds Jaccard graph constructed in 0.3730311393737793 seconds Wrote graph to binary file in 0.05182766914367676 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903571 Louvain completed 21 runs in 0.8278617858886719 seconds PhenoGraph complete in 1.5791656970977783 seconds Found communities [-1, ... 20], with sizes: [166, 331, 236, 225, 205, 202, 193, 143, 106, 103, 98, 90, 84, 66, 65, 63, 61, 51, 50, 35, 23, 22] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3079235553741455 seconds Jaccard graph constructed in 0.36020565032958984 seconds Wrote graph to binary file in 0.12332344055175781 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903427 Louvain completed 21 runs in 0.8188588619232178 seconds PhenoGraph complete in 1.6239402294158936 seconds Found communities [-1, ... 22], with sizes: [153, 349, 197, 196, 196, 189, 168, 146, 124, 106, 101, 85, 83, 75, 75, 71, 66, 59, 50, 47, 38, 17, 14, 13] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3135530948638916 seconds Jaccard graph constructed in 0.3795955181121826 seconds Wrote graph to binary file in 0.04305911064147949 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901231 After 2 runs, maximum modularity is Q = 0.9023 Louvain completed 22 runs in 0.9632749557495117 seconds PhenoGraph complete in 1.7123239040374756 seconds Found communities [-1, ... 19], with sizes: [175, 389, 351, 226, 215, 177, 124, 109, 101, 99, 91, 87, 76, 74, 73, 64, 61, 58, 36, 17, 15] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30730199813842773 seconds Jaccard graph constructed in 0.3530082702636719 seconds Wrote graph to binary file in 0.1405472755432129 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903312 Louvain completed 21 runs in 0.8128602504730225 seconds PhenoGraph complete in 1.6249172687530518 seconds Found communities [-1, ... 20], with sizes: [145, 339, 309, 198, 192, 154, 150, 132, 114, 108, 104, 101, 91, 86, 76, 74, 72, 58, 45, 42, 16, 12] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3109710216522217 seconds Jaccard graph constructed in 0.3767588138580322 seconds Wrote graph to binary file in 0.04238247871398926 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902124 Louvain completed 21 runs in 0.8175806999206543 seconds PhenoGraph complete in 1.560603380203247 seconds Found communities [-1, ... 19], with sizes: [197, 342, 259, 199, 190, 176, 165, 129, 124, 118, 100, 97, 82, 82, 72, 64, 59, 52, 52, 42, 17] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30885910987854004 seconds Jaccard graph constructed in 0.3689866065979004 seconds Wrote graph to binary file in 0.14580225944519043 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899958 After 3 runs, maximum modularity is Q = 0.901022 Louvain completed 23 runs in 1.0120658874511719 seconds PhenoGraph complete in 1.8503143787384033 seconds Found communities [-1, ... 20], with sizes: [171, 356, 349, 192, 189, 170, 165, 124, 121, 102, 87, 78, 71, 70, 64, 63, 59, 57, 54, 39, 19, 18] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3081333637237549 seconds Jaccard graph constructed in 0.37570738792419434 seconds Wrote graph to binary file in 0.04296064376831055 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901547 After 4 runs, maximum modularity is Q = 0.902717 Louvain completed 24 runs in 1.0371849536895752 seconds PhenoGraph complete in 1.7782173156738281 seconds Found communities [-1, ... 22], with sizes: [179, 327, 270, 183, 181, 178, 144, 128, 101, 95, 95, 93, 88, 82, 68, 67, 61, 58, 53, 53, 39, 33, 23, 19] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30927157402038574 seconds Jaccard graph constructed in 0.37598204612731934 seconds Wrote graph to binary file in 0.12826943397521973 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903794 After 2 runs, maximum modularity is Q = 0.905582 Louvain completed 22 runs in 0.9593186378479004 seconds PhenoGraph complete in 1.786625862121582 seconds Found communities [-1, ... 18], with sizes: [193, 376, 348, 241, 197, 166, 145, 132, 115, 102, 99, 84, 75, 73, 71, 58, 57, 42, 27, 17] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30988073348999023 seconds Jaccard graph constructed in 0.3881664276123047 seconds Wrote graph to binary file in 0.0462191104888916 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904599 Louvain completed 21 runs in 0.8111028671264648 seconds PhenoGraph complete in 1.5830721855163574 seconds Found communities [-1, ... 20], with sizes: [131, 335, 323, 264, 199, 170, 166, 133, 115, 110, 89, 89, 71, 70, 68, 60, 59, 56, 56, 19, 19, 16] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3081212043762207 seconds Jaccard graph constructed in 0.36143040657043457 seconds Wrote graph to binary file in 0.14709806442260742 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902482 Louvain completed 21 runs in 0.8033449649810791 seconds PhenoGraph complete in 1.6391990184783936 seconds Found communities [-1, ... 18], with sizes: [164, 359, 308, 215, 212, 190, 161, 156, 154, 100, 97, 93, 73, 66, 66, 65, 61, 43, 18, 17]
# Post-doublet-inference normalisation of D344_Biop_Nas1.
# Each cell is rescaled to a total of 1e4 counts, then log1p-transformed.
sc.pp.normalize_per_cell(D344_Biop_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D344_Biop_Nas1) # log transform the data
# NOTE: .raw is assigned after normalisation and log1p and before the gene
# subsetting below, so it keeps the normalised, log-transformed values for
# all genes -- not the untransformed counts.
D344_Biop_Nas1.raw = D344_Biop_Nas1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): presumably True marks genes that are NOT in the ribosomal gene
# list (the flag for this sample is set earlier in the notebook) -- confirm.
D344_Biop_Nas1 = D344_Biop_Nas1[:, D344_Biop_Nas1.var['ribo_genes']]
# Bare expression: displays the resulting AnnData summary in the notebook.
D344_Biop_Nas1
View of AnnData object with n_obs × n_vars = 2095 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the filtered 10x matrix of sample D345_Biop_Nas1, indexing genes by
# symbol (duplicated symbols are made unique right after loading).
D345_Biop_Nas1 = sc.read_10x_mtx('./D345_Biop_Nas1/' + outsPath,
                                 var_names='gene_symbols', cache=True)
D345_Biop_Nas1.var_names_make_unique()

# Sample-level annotations; each scalar is broadcast to every cell.
D345_Biop_Nas1.obs['manip'] = 'D345_Biop_Nas1'
D345_Biop_Nas1.obs['position'] = 'Nasal'
D345_Biop_Nas1.obs['method'] = 'Biopsy'
D345_Biop_Nas1.obs['donor'] = 'D345'

# Prefix every barcode with the sample name and use the result as cell name
# (presumably so that names stay unique once the datasets are aggregated).
D345_Biop_Nas1.obs['name'] = [
    'D345_Biop_Nas1_' + barcode for barcode in D345_Biop_Nas1.obs_names
]
D345_Biop_Nas1.obs_names = D345_Biop_Nas1.obs['name']

# Bare expression: displays the AnnData summary in the notebook.
D345_Biop_Nas1
... reading from cache file ./cache/D345_Biop_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 3259 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# ---- QC metrics for D345_Biop_Nas1 (computed on the unfiltered counts) ----
sc.pl.highest_expr_genes(D345_Biop_Nas1, n_top=20)

# min_genes=0 is not meant to remove cells; the call is used for its side
# effect of adding the per-cell gene count to .obs['n_genes'].
sc.pp.filter_cells(D345_Biop_Nas1, min_genes=0)

# Total counts per cell, computed once and reused for every fraction below.
total_counts = D345_Biop_Nas1.X.sum(axis=1).A1

# Fraction of counts coming from mitochondrial genes (symbols starting 'MT-').
mito_genes = D345_Biop_Nas1.var_names.str.startswith('MT-')
D345_Biop_Nas1.obs['percent_mito'] = np.sum(
    D345_Biop_Nas1[:, mito_genes].X, axis=1).A1 / total_counts
D345_Biop_Nas1.obs['n_counts'] = total_counts

# remove_RB_genes() needs a dense cells x genes DataFrame; in this section only
# the list of ribosomal genes found in the data (RB_genes_in_df) is used.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D345_Biop_Nas1.to_df())

# The DataFrame columns are the var_names, so build the ribosomal mask from
# var_names directly instead of densifying the whole matrix a second time.
ribo_genes = D345_Biop_Nas1.var_names.isin(RB_genes_in_df)
D345_Biop_Nas1.obs['percent_ribo'] = np.sum(
    D345_Biop_Nas1[:, ribo_genes].X, axis=1).A1 / total_counts

# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT
# ribosomal; it is used later as the "genes to keep" mask.
D345_Biop_Nas1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D345_Biop_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D345_Biop_Nas1:
#   - at least 500 detected genes (applied in place),
#   - fewer than 20000 total counts,
#   - less than 20% of counts from mitochondrial genes.
sc.pp.filter_cells(D345_Biop_Nas1, min_genes=500)
D345_Biop_Nas1 = D345_Biop_Nas1[
    (D345_Biop_Nas1.obs['n_counts'] < 20000)
    & (D345_Biop_Nas1.obs['percent_mito'] < 0.2),
    :,
]
filtered out 32 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the QC-filtered matrix of D345_Biop_Nas1,
# run before any normalisation. A doublet rate of 2.5% is assumed for this sample.
scrub = scr.Scrublet(D345_Biop_Nas1.X, expected_doublet_rate=0.025)
# scrub_doublets() returns two per-cell results: a doublet score and the
# doublet call (the recorded log shows the score threshold is set automatically).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D345_Biop_Nas1.obs['doublet_scores'] = doublet_scores
D345_Biop_Nas1.obs['predicted_doublets'] = predicted_doublets
# Histogram of doublet scores, for visual inspection of the threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.20 Detected doublet rate = 1.0% Estimated detectable doublet fraction = 33.4% Overall doublet rate: Expected = 2.5% Estimated = 3.0% Elapsed time: 2.1 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb8d51208>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecc4fa518>],
dtype=object))
# doubletDetection
# Second doublet caller, run on the same filtered matrix as Scrublet.
# BoostClassifier is used with its default settings (the recorded log shows
# 25 iterations); the sparse input is densified by the library (see warning).
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D345_Biop_Nas1.X).predict()
# One label per cell, stored next to the Scrublet results.
# NOTE(review): label encoding presumably 1 = doublet, 0 = singlet -- confirm
# against the doubletdetection documentation.
D345_Biop_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.509113073348999 seconds Jaccard graph constructed in 0.47003698348999023 seconds Wrote graph to binary file in 0.16583967208862305 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.875367 Louvain completed 21 runs in 1.0838100910186768 seconds PhenoGraph complete in 2.2446818351745605 seconds Found communities [-1, ... 20], with sizes: [55, 513, 452, 370, 293, 292, 270, 249, 240, 213, 196, 129, 123, 122, 110, 95, 64, 64, 58, 57, 53, 13] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.509962797164917 seconds Jaccard graph constructed in 0.4884061813354492 seconds Wrote graph to binary file in 0.07520413398742676 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876782 After 8 runs, maximum modularity is Q = 0.877922 Louvain completed 28 runs in 1.4965903759002686 seconds PhenoGraph complete in 2.58528733253479 seconds Found communities [-1, ... 19], with sizes: [60, 682, 475, 397, 327, 290, 257, 251, 237, 198, 137, 136, 105, 82, 73, 72, 66, 61, 56, 55, 14] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5157821178436279 seconds Jaccard graph constructed in 0.6081719398498535 seconds Wrote graph to binary file in 0.07874584197998047 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.877342 After 3 runs, maximum modularity is Q = 0.879469 Louvain completed 23 runs in 1.3534622192382812 seconds PhenoGraph complete in 2.5744004249572754 seconds Found communities [-1, ... 20], with sizes: [54, 499, 459, 418, 318, 288, 266, 261, 243, 232, 214, 135, 116, 110, 100, 73, 72, 60, 52, 37, 13, 11] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5116434097290039 seconds Jaccard graph constructed in 0.48578476905822754 seconds Wrote graph to binary file in 0.2102823257446289 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.878289 After 2 runs, maximum modularity is Q = 0.879619 After 6 runs, maximum modularity is Q = 0.881189 Louvain completed 26 runs in 1.5322465896606445 seconds PhenoGraph complete in 2.75622820854187 seconds Found communities [-1, ... 20], with sizes: [76, 444, 394, 375, 344, 277, 276, 261, 247, 233, 194, 177, 136, 102, 102, 92, 91, 71, 61, 52, 13, 13] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.509721040725708 seconds Jaccard graph constructed in 0.4993550777435303 seconds Wrote graph to binary file in 0.16241002082824707 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.875284 After 8 runs, maximum modularity is Q = 0.876389 After 14 runs, maximum modularity is Q = 0.877973 Louvain completed 34 runs in 1.9067745208740234 seconds PhenoGraph complete in 3.0941972732543945 seconds Found communities [-1, ... 20], with sizes: [57, 518, 468, 446, 380, 342, 325, 248, 243, 206, 122, 119, 102, 94, 66, 64, 60, 59, 56, 25, 20, 11] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5105977058410645 seconds Jaccard graph constructed in 0.48778533935546875 seconds Wrote graph to binary file in 0.16517400741577148 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.874687 After 21 runs, maximum modularity is Q = 0.875736 Louvain completed 41 runs in 2.0271809101104736 seconds PhenoGraph complete in 3.2078487873077393 seconds Found communities [-1, ... 19], with sizes: [61, 515, 482, 407, 368, 330, 318, 264, 241, 210, 190, 135, 133, 101, 75, 58, 54, 41, 23, 14, 11] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5117900371551514 seconds Jaccard graph constructed in 0.5218122005462646 seconds Wrote graph to binary file in 0.1895432472229004 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872572 After 2 runs, maximum modularity is Q = 0.874746 After 9 runs, maximum modularity is Q = 0.876278 Louvain completed 29 runs in 1.7165610790252686 seconds PhenoGraph complete in 2.9558792114257812 seconds Found communities [-1, ... 20], with sizes: [45, 646, 482, 314, 308, 282, 265, 254, 240, 196, 185, 136, 115, 114, 91, 81, 70, 67, 63, 51, 13, 13] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5092618465423584 seconds Jaccard graph constructed in 0.5011942386627197 seconds Wrote graph to binary file in 0.07474088668823242 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.877227 After 4 runs, maximum modularity is Q = 0.878352 Louvain completed 24 runs in 1.3611412048339844 seconds PhenoGraph complete in 2.462210178375244 seconds Found communities [-1, ... 19], with sizes: [58, 479, 421, 416, 335, 325, 287, 274, 259, 186, 179, 143, 141, 137, 92, 64, 61, 57, 52, 51, 14] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5097126960754395 seconds Jaccard graph constructed in 0.473783016204834 seconds Wrote graph to binary file in 0.18263554573059082 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876872 After 2 runs, maximum modularity is Q = 0.878572 Louvain completed 22 runs in 1.2276060581207275 seconds PhenoGraph complete in 2.415370225906372 seconds Found communities [-1, ... 19], with sizes: [52, 529, 466, 438, 381, 340, 273, 265, 198, 177, 134, 131, 129, 116, 110, 85, 69, 59, 53, 14, 12] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5107555389404297 seconds Jaccard graph constructed in 0.49406886100769043 seconds Wrote graph to binary file in 0.17885661125183105 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882089 Louvain completed 21 runs in 1.0463809967041016 seconds PhenoGraph complete in 2.2454686164855957 seconds Found communities [-1, ... 19], with sizes: [52, 492, 457, 434, 311, 310, 305, 302, 266, 194, 182, 140, 117, 105, 82, 77, 65, 60, 54, 14, 12] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5092160701751709 seconds Jaccard graph constructed in 0.49118804931640625 seconds Wrote graph to binary file in 0.16657304763793945 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.875191 After 2 runs, maximum modularity is Q = 0.876792 Louvain completed 22 runs in 1.2382164001464844 seconds PhenoGraph complete in 2.4236416816711426 seconds Found communities [-1, ... 19], with sizes: [41, 604, 448, 368, 343, 342, 325, 291, 256, 230, 192, 142, 99, 73, 69, 64, 54, 37, 27, 15, 11] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5095369815826416 seconds Jaccard graph constructed in 0.509570837020874 seconds Wrote graph to binary file in 0.19415497779846191 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.877646 After 11 runs, maximum modularity is Q = 0.879248 Louvain completed 31 runs in 1.5875606536865234 seconds PhenoGraph complete in 2.8171303272247314 seconds Found communities [-1, ... 18], with sizes: [52, 485, 438, 436, 340, 329, 313, 310, 244, 189, 164, 162, 136, 102, 72, 67, 64, 61, 52, 15] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5167100429534912 seconds Jaccard graph constructed in 0.5146276950836182 seconds Wrote graph to binary file in 0.07436084747314453 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.877752 Louvain completed 21 runs in 1.036419153213501 seconds PhenoGraph complete in 2.167802095413208 seconds Found communities [-1, ... 20], with sizes: [61, 485, 338, 315, 300, 299, 291, 281, 272, 246, 226, 162, 152, 143, 108, 94, 66, 59, 55, 51, 15, 12] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5120022296905518 seconds Jaccard graph constructed in 0.4786539077758789 seconds Wrote graph to binary file in 0.18381333351135254 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876809 Louvain completed 21 runs in 1.106832504272461 seconds PhenoGraph complete in 2.2989614009857178 seconds Found communities [-1, ... 18], with sizes: [53, 581, 560, 413, 306, 289, 289, 282, 243, 194, 158, 130, 115, 112, 105, 67, 54, 54, 14, 12] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5097548961639404 seconds Jaccard graph constructed in 0.4959716796875 seconds Wrote graph to binary file in 0.16306304931640625 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876634 After 2 runs, maximum modularity is Q = 0.878518 Louvain completed 22 runs in 1.2712428569793701 seconds PhenoGraph complete in 2.455409526824951 seconds Found communities [-1, ... 18], with sizes: [63, 643, 543, 482, 335, 310, 272, 258, 209, 134, 128, 119, 115, 96, 90, 77, 67, 60, 17, 13] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5097231864929199 seconds Jaccard graph constructed in 0.47243189811706543 seconds Wrote graph to binary file in 0.18134784698486328 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.877117 After 4 runs, maximum modularity is Q = 0.878692 Louvain completed 24 runs in 1.3584632873535156 seconds PhenoGraph complete in 2.536989450454712 seconds Found communities [-1, ... 19], with sizes: [70, 508, 489, 384, 335, 328, 328, 303, 261, 222, 174, 157, 117, 95, 59, 57, 55, 37, 24, 15, 13] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.509711742401123 seconds Jaccard graph constructed in 0.4906728267669678 seconds Wrote graph to binary file in 0.18431615829467773 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.874887 After 2 runs, maximum modularity is Q = 0.877135 After 14 runs, maximum modularity is Q = 0.878304 Louvain completed 34 runs in 1.9109795093536377 seconds PhenoGraph complete in 3.111431121826172 seconds Found communities [-1, ... 21], with sizes: [73, 471, 432, 415, 398, 330, 258, 253, 241, 233, 143, 132, 119, 117, 79, 68, 57, 56, 54, 53, 21, 15, 13] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5172734260559082 seconds Jaccard graph constructed in 0.47579026222229004 seconds Wrote graph to binary file in 0.07378578186035156 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.879685 Louvain completed 21 runs in 1.0910844802856445 seconds PhenoGraph complete in 2.1752805709838867 seconds Found communities [-1, ... 20], with sizes: [54, 477, 477, 437, 380, 323, 284, 277, 233, 142, 138, 132, 129, 107, 98, 86, 63, 60, 57, 52, 13, 12] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5105595588684082 seconds Jaccard graph constructed in 0.48531103134155273 seconds Wrote graph to binary file in 0.18321013450622559 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.874996 After 2 runs, maximum modularity is Q = 0.876336 After 3 runs, maximum modularity is Q = 0.877998 Louvain completed 23 runs in 1.5135843753814697 seconds PhenoGraph complete in 2.7096638679504395 seconds Found communities [-1, ... 19], with sizes: [59, 467, 458, 422, 324, 307, 296, 276, 272, 194, 188, 178, 138, 102, 77, 72, 60, 58, 55, 17, 11] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5126919746398926 seconds Jaccard graph constructed in 0.4905238151550293 seconds Wrote graph to binary file in 0.17134785652160645 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.878769 Louvain completed 21 runs in 1.047325849533081 seconds PhenoGraph complete in 2.2437732219696045 seconds Found communities [-1, ... 17], with sizes: [78, 473, 441, 418, 318, 309, 305, 279, 258, 243, 188, 156, 138, 97, 91, 90, 76, 58, 15] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5183579921722412 seconds Jaccard graph constructed in 0.4668898582458496 seconds Wrote graph to binary file in 0.16524100303649902 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.878542 After 2 runs, maximum modularity is Q = 0.879898 After 4 runs, maximum modularity is Q = 0.881055 After 5 runs, maximum modularity is Q = 0.88249 Louvain completed 25 runs in 1.5979628562927246 seconds PhenoGraph complete in 2.766202688217163 seconds Found communities [-1, ... 19], with sizes: [71, 501, 467, 314, 314, 285, 283, 262, 261, 255, 208, 207, 144, 96, 77, 76, 67, 67, 50, 13, 13] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5119054317474365 seconds Jaccard graph constructed in 0.49642109870910645 seconds Wrote graph to binary file in 0.19015765190124512 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.877089 After 8 runs, maximum modularity is Q = 0.878562 Louvain completed 28 runs in 1.5239088535308838 seconds PhenoGraph complete in 2.7475368976593018 seconds Found communities [-1, ... 18], with sizes: [57, 682, 475, 356, 333, 271, 267, 233, 226, 207, 193, 166, 129, 121, 79, 67, 59, 55, 41, 14] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5094122886657715 seconds Jaccard graph constructed in 0.49228978157043457 seconds Wrote graph to binary file in 0.07393264770507812 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87725 After 3 runs, maximum modularity is Q = 0.878287 Louvain completed 23 runs in 1.330054521560669 seconds PhenoGraph complete in 2.421684980392456 seconds Found communities [-1, ... 19], with sizes: [70, 511, 425, 404, 349, 325, 306, 295, 271, 247, 189, 139, 100, 71, 70, 69, 62, 56, 47, 14, 11] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5103309154510498 seconds Jaccard graph constructed in 0.4899752140045166 seconds Wrote graph to binary file in 0.1648874282836914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.877486 After 9 runs, maximum modularity is Q = 0.878707 Louvain completed 29 runs in 1.5609664916992188 seconds PhenoGraph complete in 2.74412202835083 seconds Found communities [-1, ... 19], with sizes: [68, 458, 440, 371, 369, 349, 278, 267, 246, 240, 193, 130, 119, 114, 75, 73, 70, 60, 54, 44, 13] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5118117332458496 seconds Jaccard graph constructed in 0.48421311378479004 seconds Wrote graph to binary file in 0.18859124183654785 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.878439 After 3 runs, maximum modularity is Q = 0.880161 Louvain completed 23 runs in 1.2550854682922363 seconds PhenoGraph complete in 2.455286979675293 seconds Found communities [-1, ... 20], with sizes: [55, 486, 470, 358, 323, 296, 284, 253, 243, 211, 171, 146, 132, 131, 101, 75, 74, 69, 66, 61, 13, 13]
# Post-doublet-inference normalisation of D345_Biop_Nas1.
# Each cell is rescaled to a total of 1e4 counts, then log1p-transformed.
sc.pp.normalize_per_cell(D345_Biop_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D345_Biop_Nas1) # log transform the data
# NOTE: .raw is assigned after normalisation and log1p and before the gene
# subsetting below, so it keeps the normalised, log-transformed values for
# all genes -- not the untransformed counts.
D345_Biop_Nas1.raw = D345_Biop_Nas1 # freeze the object (for later use of the raw state of it)
# var['ribo_genes'] was set to the negation of the ribosomal-gene mask, i.e.
# True = gene is NOT ribosomal; this subsetting therefore drops the ribosomal
# genes and leaves a view of the object.
D345_Biop_Nas1 = D345_Biop_Nas1[:, D345_Biop_Nas1.var['ribo_genes']]
# Bare expression: displays the resulting AnnData summary in the notebook.
D345_Biop_Nas1
View of AnnData object with n_obs × n_vars = 3225 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the filtered 10x matrix of sample D353_Brus_Nas1, indexing genes by
# symbol (duplicated symbols are made unique right after loading).
D353_Brus_Nas1 = sc.read_10x_mtx('./D353_Brus_Nas1/' + outsPath,
                                 var_names='gene_symbols', cache=True)
D353_Brus_Nas1.var_names_make_unique()

# Sample-level annotations; each scalar is broadcast to every cell.
D353_Brus_Nas1.obs['manip'] = 'D353_Brus_Nas1'
D353_Brus_Nas1.obs['position'] = 'Nasal'
D353_Brus_Nas1.obs['method'] = 'Brushing'
D353_Brus_Nas1.obs['donor'] = 'D353'

# Prefix every barcode with the sample name and use the result as cell name
# (presumably so that names stay unique once the datasets are aggregated).
D353_Brus_Nas1.obs['name'] = [
    'D353_Brus_Nas1_' + barcode for barcode in D353_Brus_Nas1.obs_names
]
D353_Brus_Nas1.obs_names = D353_Brus_Nas1.obs['name']

# Bare expression: displays the AnnData summary in the notebook.
D353_Brus_Nas1
... reading from cache file ./cache/D353_Brus_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 5154 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# ---- QC metrics for D353_Brus_Nas1 (computed on the unfiltered counts) ----
sc.pl.highest_expr_genes(D353_Brus_Nas1, n_top=20)

# min_genes=0 is not meant to remove cells; the call is used for its side
# effect of adding the per-cell gene count to .obs['n_genes'].
sc.pp.filter_cells(D353_Brus_Nas1, min_genes=0)

# Total counts per cell, computed once and reused for every fraction below.
total_counts = D353_Brus_Nas1.X.sum(axis=1).A1

# Fraction of counts coming from mitochondrial genes (symbols starting 'MT-').
mito_genes = D353_Brus_Nas1.var_names.str.startswith('MT-')
D353_Brus_Nas1.obs['percent_mito'] = np.sum(
    D353_Brus_Nas1[:, mito_genes].X, axis=1).A1 / total_counts
D353_Brus_Nas1.obs['n_counts'] = total_counts

# remove_RB_genes() needs a dense cells x genes DataFrame; in this section only
# the list of ribosomal genes found in the data (RB_genes_in_df) is used.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Brus_Nas1.to_df())

# The DataFrame columns are the var_names, so build the ribosomal mask from
# var_names directly instead of densifying the whole matrix a second time.
ribo_genes = D353_Brus_Nas1.var_names.isin(RB_genes_in_df)
D353_Brus_Nas1.obs['percent_ribo'] = np.sum(
    D353_Brus_Nas1[:, ribo_genes].X, axis=1).A1 / total_counts

# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT
# ribosomal; it is written here to be used as a "genes to keep" mask.
D353_Brus_Nas1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D353_Brus_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D353_Brus_Nas1:
#   - at least 500 detected genes (applied in place),
#   - fewer than 40000 total counts,
#   - less than 50% of counts from mitochondrial genes.
sc.pp.filter_cells(D353_Brus_Nas1, min_genes=500)
D353_Brus_Nas1 = D353_Brus_Nas1[
    (D353_Brus_Nas1.obs['n_counts'] < 40000)
    & (D353_Brus_Nas1.obs['percent_mito'] < 0.5),
    :,
]
filtered out 3 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the QC-filtered matrix of D353_Brus_Nas1,
# run before any normalisation. A doublet rate of 4% is assumed for this sample.
scrub = scr.Scrublet(D353_Brus_Nas1.X, expected_doublet_rate=0.04)
# scrub_doublets() returns two per-cell results: a doublet score and the
# doublet call (the recorded log shows the score threshold is set automatically).
# NOTE(review): in the recorded run the estimated detectable doublet fraction
# is only 10.8%, so the calls for this sample are worth checking on the histogram.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D353_Brus_Nas1.obs['doublet_scores'] = doublet_scores
D353_Brus_Nas1.obs['predicted_doublets'] = predicted_doublets
# Histogram of doublet scores, for visual inspection of the threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.47 Detected doublet rate = 0.1% Estimated detectable doublet fraction = 10.8% Overall doublet rate: Expected = 4.0% Estimated = 0.9% Elapsed time: 5.6 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecb02bc50>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecafefd30>],
dtype=object))
# doubletDetection
# Second doublet caller, run on the same filtered matrix as Scrublet.
# BoostClassifier is used with its default settings (the recorded log shows
# 25 iterations); the sparse input is densified by the library (see warning).
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D353_Brus_Nas1.X).predict()
# One label per cell, stored next to the Scrublet results.
# NOTE(review): label encoding presumably 1 = doublet, 0 = singlet -- confirm
# against the doubletdetection documentation.
D353_Brus_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2127726078033447 seconds Jaccard graph constructed in 0.7197434902191162 seconds Wrote graph to binary file in 0.25298023223876953 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909195 After 2 runs, maximum modularity is Q = 0.911025 Louvain completed 22 runs in 1.7851529121398926 seconds PhenoGraph complete in 3.994328022003174 seconds Found communities [-1, ... 26], with sizes: [42, 780, 676, 492, 439, 415, 408, 329, 310, 298, 291, 286, 216, 207, 155, 153, 130, 120, 115, 100, 98, 70, 69, 65, 51, 41, 36, 21] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.7255289554595947 seconds Jaccard graph constructed in 0.927344560623169 seconds Wrote graph to binary file in 0.24228453636169434 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910609 Louvain completed 21 runs in 1.581740379333496 seconds PhenoGraph complete in 4.498773574829102 seconds Found communities [-1, ... 25], with sizes: [51, 999, 539, 471, 452, 436, 387, 315, 309, 298, 269, 267, 243, 215, 174, 148, 148, 144, 102, 87, 79, 69, 64, 53, 36, 29, 29] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2243099212646484 seconds Jaccard graph constructed in 0.6852452754974365 seconds Wrote graph to binary file in 0.24946808815002441 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910357 Louvain completed 21 runs in 1.5184452533721924 seconds PhenoGraph complete in 3.697181463241577 seconds Found communities [-1, ... 28], with sizes: [61, 606, 479, 471, 436, 429, 417, 367, 332, 295, 292, 283, 215, 213, 208, 205, 186, 156, 145, 91, 90, 73, 69, 61, 60, 52, 39, 36, 28, 18] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2142560482025146 seconds Jaccard graph constructed in 0.6664714813232422 seconds Wrote graph to binary file in 0.23502373695373535 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909123 After 2 runs, maximum modularity is Q = 0.911097 Louvain completed 22 runs in 1.88018798828125 seconds PhenoGraph complete in 4.01696252822876 seconds Found communities [-1, ... 28], with sizes: [49, 710, 629, 550, 488, 403, 388, 308, 306, 295, 285, 263, 261, 233, 182, 155, 141, 136, 90, 73, 70, 68, 65, 55, 52, 44, 36, 34, 29, 15] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2145659923553467 seconds Jaccard graph constructed in 0.8374745845794678 seconds Wrote graph to binary file in 0.21868538856506348 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910927 Louvain completed 21 runs in 1.5645432472229004 seconds PhenoGraph complete in 3.8568105697631836 seconds Found communities [-1, ... 25], with sizes: [38, 925, 796, 689, 562, 430, 322, 271, 249, 231, 216, 201, 191, 172, 169, 145, 130, 110, 90, 90, 87, 70, 58, 57, 42, 36, 36] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.214392900466919 seconds Jaccard graph constructed in 0.6968972682952881 seconds Wrote graph to binary file in 0.24641752243041992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909486 Louvain completed 21 runs in 1.521557331085205 seconds PhenoGraph complete in 3.7022786140441895 seconds Found communities [-1, ... 26], with sizes: [37, 1023, 743, 450, 431, 400, 342, 304, 284, 238, 236, 223, 213, 206, 188, 180, 149, 144, 114, 90, 86, 70, 67, 56, 42, 37, 37, 23] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2143833637237549 seconds Jaccard graph constructed in 0.737191915512085 seconds Wrote graph to binary file in 0.2756507396697998 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911245 Louvain completed 21 runs in 1.66801118850708 seconds PhenoGraph complete in 3.927262783050537 seconds Found communities [-1, ... 27], with sizes: [45, 1077, 539, 453, 436, 410, 393, 328, 307, 307, 268, 256, 218, 211, 169, 147, 130, 109, 99, 91, 70, 64, 57, 53, 50, 39, 36, 28, 23] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2155818939208984 seconds Jaccard graph constructed in 0.836904764175415 seconds Wrote graph to binary file in 0.22152042388916016 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909791 After 12 runs, maximum modularity is Q = 0.910807 Louvain completed 32 runs in 2.4536595344543457 seconds PhenoGraph complete in 4.7473509311676025 seconds Found communities [-1, ... 29], with sizes: [52, 876, 626, 487, 417, 408, 391, 300, 275, 264, 220, 219, 219, 194, 184, 173, 156, 151, 142, 92, 88, 76, 70, 65, 57, 51, 48, 36, 26, 25, 25] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2221226692199707 seconds Jaccard graph constructed in 0.740626335144043 seconds Wrote graph to binary file in 0.3226799964904785 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911461 Louvain completed 21 runs in 1.550100564956665 seconds PhenoGraph complete in 3.8605575561523438 seconds Found communities [-1, ... 27], with sizes: [43, 970, 605, 522, 467, 410, 316, 306, 301, 285, 281, 244, 244, 185, 155, 143, 129, 127, 100, 91, 76, 71, 70, 68, 54, 51, 37, 36, 26] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2145960330963135 seconds Jaccard graph constructed in 0.7350647449493408 seconds Wrote graph to binary file in 0.262676477432251 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906941 After 5 runs, maximum modularity is Q = 0.908376 Louvain completed 25 runs in 2.062347650527954 seconds PhenoGraph complete in 4.295223236083984 seconds Found communities [-1, ... 24], with sizes: [54, 1001, 686, 550, 549, 391, 337, 334, 294, 284, 274, 238, 222, 203, 155, 141, 135, 101, 94, 91, 68, 67, 52, 40, 39, 13] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2217864990234375 seconds Jaccard graph constructed in 0.7268059253692627 seconds Wrote graph to binary file in 0.24543118476867676 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907168 After 3 runs, maximum modularity is Q = 0.908238 Louvain completed 23 runs in 1.8657281398773193 seconds PhenoGraph complete in 4.082415819168091 seconds Found communities [-1, ... 26], with sizes: [42, 761, 632, 631, 467, 418, 362, 356, 271, 262, 262, 259, 231, 222, 204, 143, 139, 119, 101, 86, 85, 74, 71, 67, 53, 39, 39, 17] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2191917896270752 seconds Jaccard graph constructed in 0.8307540416717529 seconds Wrote graph to binary file in 0.21717500686645508 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908685 After 5 runs, maximum modularity is Q = 0.909771 Louvain completed 25 runs in 1.980724573135376 seconds PhenoGraph complete in 4.268359899520874 seconds Found communities [-1, ... 28], with sizes: [62, 830, 716, 523, 424, 400, 312, 287, 265, 258, 252, 244, 234, 221, 212, 151, 139, 130, 118, 99, 90, 72, 69, 65, 64, 53, 42, 36, 25, 20] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.21579909324646 seconds Jaccard graph constructed in 0.7258813381195068 seconds Wrote graph to binary file in 0.24634504318237305 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909735 Louvain completed 21 runs in 1.53537917137146 seconds PhenoGraph complete in 3.7452125549316406 seconds Found communities [-1, ... 28], with sizes: [48, 648, 632, 494, 433, 411, 397, 317, 307, 292, 286, 248, 227, 222, 216, 202, 152, 137, 128, 90, 87, 70, 69, 62, 55, 52, 50, 36, 23, 22] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2153537273406982 seconds Jaccard graph constructed in 0.7558958530426025 seconds Wrote graph to binary file in 0.24044060707092285 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908969 Louvain completed 21 runs in 1.5639097690582275 seconds PhenoGraph complete in 3.8045454025268555 seconds Found communities [-1, ... 27], with sizes: [48, 899, 628, 546, 484, 417, 306, 293, 292, 270, 243, 240, 224, 215, 188, 187, 154, 116, 93, 91, 89, 75, 70, 64, 53, 44, 36, 34, 14] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.214221477508545 seconds Jaccard graph constructed in 0.8624017238616943 seconds Wrote graph to binary file in 0.21771502494812012 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910122 Louvain completed 21 runs in 1.6675090789794922 seconds PhenoGraph complete in 3.9836511611938477 seconds Found communities [-1, ... 25], with sizes: [61, 964, 700, 549, 469, 469, 381, 368, 275, 251, 230, 214, 206, 200, 157, 153, 128, 104, 94, 85, 70, 67, 57, 54, 43, 37, 27] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2226972579956055 seconds Jaccard graph constructed in 0.7140727043151855 seconds Wrote graph to binary file in 0.24596619606018066 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909802 Louvain completed 21 runs in 1.6083636283874512 seconds PhenoGraph complete in 3.81315541267395 seconds Found communities [-1, ... 26], with sizes: [35, 974, 779, 712, 516, 402, 332, 316, 295, 275, 217, 196, 191, 152, 138, 137, 112, 101, 86, 77, 72, 64, 62, 54, 42, 36, 29, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2138848304748535 seconds Jaccard graph constructed in 0.7432422637939453 seconds Wrote graph to binary file in 0.24086976051330566 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910733 Louvain completed 21 runs in 1.567589282989502 seconds PhenoGraph complete in 3.7843167781829834 seconds Found communities [-1, ... 26], with sizes: [49, 931, 757, 551, 466, 390, 358, 302, 279, 236, 233, 224, 209, 199, 196, 161, 152, 116, 94, 89, 72, 71, 62, 62, 55, 38, 36, 25] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2250697612762451 seconds Jaccard graph constructed in 0.9474897384643555 seconds Wrote graph to binary file in 0.23464679718017578 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907892 After 5 runs, maximum modularity is Q = 0.909824 Louvain completed 25 runs in 2.123487949371338 seconds PhenoGraph complete in 4.55275559425354 seconds Found communities [-1, ... 26], with sizes: [36, 967, 556, 491, 456, 431, 405, 337, 308, 304, 296, 260, 235, 166, 159, 136, 120, 118, 114, 92, 74, 69, 65, 56, 48, 46, 37, 31] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2245111465454102 seconds Jaccard graph constructed in 0.7266659736633301 seconds Wrote graph to binary file in 0.2795066833496094 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909389 Louvain completed 21 runs in 1.5866949558258057 seconds PhenoGraph complete in 3.8383898735046387 seconds Found communities [-1, ... 27], with sizes: [55, 774, 595, 513, 488, 419, 405, 308, 307, 285, 240, 236, 235, 233, 168, 156, 148, 124, 117, 102, 90, 70, 69, 60, 58, 52, 44, 36, 26] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3145020008087158 seconds Jaccard graph constructed in 0.7366433143615723 seconds Wrote graph to binary file in 0.2247462272644043 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912161 After 2 runs, maximum modularity is Q = 0.913311 Louvain completed 22 runs in 1.7416725158691406 seconds PhenoGraph complete in 4.041547060012817 seconds Found communities [-1, ... 28], with sizes: [46, 976, 546, 492, 453, 411, 360, 325, 287, 274, 231, 229, 217, 206, 175, 153, 149, 145, 117, 91, 75, 69, 69, 64, 63, 53, 37, 37, 36, 27] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2202198505401611 seconds Jaccard graph constructed in 0.7133526802062988 seconds Wrote graph to binary file in 0.24982571601867676 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909881 Louvain completed 21 runs in 1.5742614269256592 seconds PhenoGraph complete in 3.7793445587158203 seconds Found communities [-1, ... 28], with sizes: [51, 636, 573, 555, 441, 380, 326, 314, 305, 296, 276, 219, 216, 207, 200, 197, 157, 150, 140, 116, 96, 91, 88, 85, 73, 60, 54, 44, 36, 31] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2174968719482422 seconds Jaccard graph constructed in 0.8321328163146973 seconds Wrote graph to binary file in 0.21767473220825195 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911118 After 4 runs, maximum modularity is Q = 0.912837 Louvain completed 24 runs in 1.9395034313201904 seconds PhenoGraph complete in 4.227894306182861 seconds Found communities [-1, ... 26], with sizes: [39, 1016, 498, 494, 468, 461, 387, 331, 297, 293, 284, 242, 215, 196, 152, 138, 128, 125, 98, 91, 83, 76, 75, 69, 56, 41, 37, 23] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2213521003723145 seconds Jaccard graph constructed in 0.7391245365142822 seconds Wrote graph to binary file in 0.24411273002624512 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909974 After 7 runs, maximum modularity is Q = 0.911852 Louvain completed 27 runs in 2.086089611053467 seconds PhenoGraph complete in 4.313715934753418 seconds Found communities [-1, ... 27], with sizes: [39, 853, 520, 455, 453, 444, 416, 370, 318, 291, 287, 268, 243, 210, 172, 156, 151, 129, 102, 91, 82, 69, 62, 54, 51, 36, 34, 29, 28] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2173089981079102 seconds Jaccard graph constructed in 0.7325339317321777 seconds Wrote graph to binary file in 0.24312448501586914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908168 After 2 runs, maximum modularity is Q = 0.909446 Louvain completed 22 runs in 1.746293306350708 seconds PhenoGraph complete in 3.960584878921509 seconds Found communities [-1, ... 27], with sizes: [60, 833, 529, 528, 524, 435, 422, 306, 297, 224, 223, 218, 217, 212, 161, 151, 146, 138, 112, 102, 96, 91, 91, 64, 54, 53, 45, 45, 36] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2166056632995605 seconds Jaccard graph constructed in 0.7211313247680664 seconds Wrote graph to binary file in 0.2405107021331787 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910063 After 11 runs, maximum modularity is Q = 0.911116 Louvain completed 31 runs in 2.39713978767395 seconds PhenoGraph complete in 4.596807241439819 seconds Found communities [-1, ... 25], with sizes: [49, 804, 790, 456, 403, 395, 375, 343, 320, 312, 241, 224, 215, 212, 206, 162, 156, 134, 115, 91, 90, 74, 59, 55, 52, 44, 36]
# Normalisation of D353_Brus_Nas1. Statement order matters here: doublet
# inference above ran on the un-normalised matrix, and `.raw` must be set
# before the gene subsetting so that it still contains every gene.
sc.pp.normalize_per_cell(D353_Brus_Nas1, counts_per_cell_after=1e4) # scale every cell to 10,000 total counts
sc.pp.log1p(D353_Brus_Nas1) # log(1 + x) transform of the normalised values
# Freeze a snapshot for later use. Note that at this point the snapshot holds
# the normalised, log-transformed values for all genes, not the original counts.
D353_Brus_Nas1.raw = D353_Brus_Nas1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged in var['ribo_genes'].
# NOTE(review): despite its name this column appears to be a keep-mask (it is
# built as the negation of the ribosomal-gene mask in the sibling dataset
# cells), so this step drops the ribosomal genes -- confirm.
D353_Brus_Nas1 = D353_Brus_Nas1[:, D353_Brus_Nas1.var['ribo_genes']]
# Bare expression: displays the object summary in the notebook.
D353_Brus_Nas1
View of AnnData object with n_obs × n_vars = 5131 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x matrix of sample D363_Brus_Nas1, indexed by gene symbol.
# cache=True lets scanpy reuse an .h5ad cache file on later runs.
D363_Brus_Nas1 = sc.read_10x_mtx(
    f'./D363_Brus_Nas1/{outsPath}',
    var_names='gene_symbols',
    cache=True,
)
# Gene symbols can repeat; make the variable names unique.
D363_Brus_Nas1.var_names_make_unique()

# Sample-level annotations, identical for every cell of this dataset.
D363_Brus_Nas1.obs['manip'] = 'D363_Brus_Nas1'
D363_Brus_Nas1.obs['position'] = 'Nasal'
D363_Brus_Nas1.obs['method'] = 'Brushing'
D363_Brus_Nas1.obs['donor'] = 'D363'

# Prefix each barcode with the sample name and use the result as the cell
# identifier, both as an obs column and as the obs index.
D363_Brus_Nas1.obs['name'] = [
    f'D363_Brus_Nas1_{barcode}' for barcode in D363_Brus_Nas1.obs_names
]
D363_Brus_Nas1.obs_names = D363_Brus_Nas1.obs['name']

# Bare expression: displays the object summary in the notebook.
D363_Brus_Nas1
... reading from cache file ./cache/D363_Brus_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 3505 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell quality-control metrics for D363_Brus_Nas1: number of detected
# genes, total counts, and the fraction of counts coming from mitochondrial
# and ribosomal genes.
sc.pl.highest_expr_genes(D363_Brus_Nas1, n_top=20)

# min_genes=0 cannot remove any cell; the call is made for its side effect of
# adding the per-cell 'n_genes' column to obs.
sc.pp.filter_cells(D363_Brus_Nas1, min_genes=0)

# Mitochondrial genes are recognised by their 'MT-' symbol prefix.
mito_genes = D363_Brus_Nas1.var_names.str.startswith('MT-')
# percent_mito is a fraction in [0, 1], not a percentage. `.A1` flattens the
# matrix returned by summing the count matrix along axis 1.
D363_Brus_Nas1.obs['percent_mito'] = np.sum(
    D363_Brus_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D363_Brus_Nas1.X, axis=1).A1
D363_Brus_Nas1.obs['n_counts'] = D363_Brus_Nas1.X.sum(axis=1).A1

# remove_RB_genes needs a dense cells x genes DataFrame. Only the list of
# ribosomal genes present in this dataset is used in this cell; the other two
# return values are kept under their original names.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D363_Brus_Nas1.to_df())
# Build the ribosomal mask from var_names directly: these are the same labels
# as the columns of to_df(), without densifying the matrix a second time.
ribo_genes = D363_Brus_Nas1.var_names.isin(RB_genes_in_df)
D363_Brus_Nas1.obs['percent_ribo'] = np.sum(
    D363_Brus_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D363_Brus_Nas1.X, axis=1).A1
# NOTE: despite its name, var['ribo_genes'] is True for the NON-ribosomal
# genes, i.e. it is the mask of genes to keep. The key is left unchanged
# because other cells of this file index the variables with it.
D363_Brus_Nas1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D363_Brus_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering: drop cells with fewer than 500 detected genes,
# then keep only cells below 30000 total counts whose mitochondrial fraction
# is below 0.5 (percent_mito is stored as a fraction).
sc.pp.filter_cells(D363_Brus_Nas1, min_genes=500)
D363_Brus_Nas1 = D363_Brus_Nas1[
    (D363_Brus_Nas1.obs['n_counts'] < 30000)
    & (D363_Brus_Nas1.obs['percent_mito'] < 0.5),
    :
]
filtered out 1 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the QC-filtered matrix; no normalisation
# has been applied to this dataset's X at this point in the file.
# expected_doublet_rate is the prior doublet fraction given to Scrublet (2.7% here).
scrub = scr.Scrublet(D363_Brus_Nas1.X, expected_doublet_rate=0.027)
# scrub_doublets() returns two values, unpacked here as a score and a
# doublet prediction.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata so they travel with the dataset.
D363_Brus_Nas1.obs['doublet_scores'] = doublet_scores
D363_Brus_Nas1.obs['predicted_doublets'] = predicted_doublets
# Plot Scrublet's diagnostic histogram figure.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.34 Detected doublet rate = 0.2% Estimated detectable doublet fraction = 6.1% Overall doublet rate: Expected = 2.7% Estimated = 2.8% Elapsed time: 3.1 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb88855f8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecbfc75c0>],
dtype=object))
# doubletDetection
# Second, independent doublet call: a BoostClassifier with default settings is
# trained on the same filtered count matrix, then queried for one call per cell.
clf = doubletdetection.BoostClassifier()
clf.fit(D363_Brus_Nas1.X)
labels = clf.predict()
# Keep the call next to the Scrublet results in the cell metadata.
D363_Brus_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.011885166168213 seconds Jaccard graph constructed in 0.5874404907226562 seconds Wrote graph to binary file in 0.21593761444091797 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895554 After 2 runs, maximum modularity is Q = 0.896566 Louvain completed 22 runs in 1.470625638961792 seconds PhenoGraph complete in 3.3034350872039795 seconds Found communities [-1, ... 20], with sizes: [93, 554, 551, 513, 398, 285, 271, 256, 205, 167, 166, 141, 136, 106, 92, 86, 85, 83, 80, 52, 27, 15] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0164003372192383 seconds Jaccard graph constructed in 0.542255163192749 seconds Wrote graph to binary file in 0.188262939453125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.897149 After 7 runs, maximum modularity is Q = 0.898404 Louvain completed 27 runs in 1.6902661323547363 seconds PhenoGraph complete in 3.4556806087493896 seconds Found communities [-1, ... 21], with sizes: [115, 531, 489, 449, 394, 357, 325, 288, 230, 168, 163, 145, 144, 89, 85, 80, 79, 78, 52, 42, 26, 20, 13] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9105620384216309 seconds Jaccard graph constructed in 0.5442867279052734 seconds Wrote graph to binary file in 0.1835627555847168 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.892773 After 16 runs, maximum modularity is Q = 0.89382 Louvain completed 36 runs in 2.1753244400024414 seconds PhenoGraph complete in 3.8374288082122803 seconds Found communities [-1, ... 20], with sizes: [89, 594, 547, 507, 505, 383, 278, 236, 173, 166, 142, 131, 93, 92, 88, 83, 79, 73, 52, 26, 13, 12] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.4136576652526855 seconds Jaccard graph constructed in 0.5412983894348145 seconds Wrote graph to binary file in 0.20117902755737305 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895441 After 6 runs, maximum modularity is Q = 0.896495 Louvain completed 26 runs in 1.629683017730713 seconds PhenoGraph complete in 3.8024487495422363 seconds Found communities [-1, ... 18], with sizes: [103, 593, 550, 504, 390, 315, 306, 257, 233, 199, 178, 128, 114, 88, 88, 86, 77, 75, 52, 26] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.011387586593628 seconds Jaccard graph constructed in 0.5407369136810303 seconds Wrote graph to binary file in 0.07936930656433105 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.894018 After 4 runs, maximum modularity is Q = 0.895078 Louvain completed 24 runs in 1.5311081409454346 seconds PhenoGraph complete in 3.178654670715332 seconds Found communities [-1, ... 18], with sizes: [94, 740, 529, 525, 401, 328, 263, 262, 214, 199, 173, 145, 86, 82, 79, 78, 72, 52, 27, 13] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9134063720703125 seconds Jaccard graph constructed in 0.5180704593658447 seconds Wrote graph to binary file in 0.19541668891906738 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891901 After 5 runs, maximum modularity is Q = 0.892985 Louvain completed 25 runs in 1.6027026176452637 seconds PhenoGraph complete in 3.2467846870422363 seconds Found communities [-1, ... 18], with sizes: [92, 734, 553, 548, 518, 373, 256, 242, 211, 136, 136, 103, 88, 87, 72, 64, 52, 51, 28, 18] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.4157562255859375 seconds Jaccard graph constructed in 0.571800708770752 seconds Wrote graph to binary file in 0.1983933448791504 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.892291 After 3 runs, maximum modularity is Q = 0.894405 Louvain completed 23 runs in 1.4698371887207031 seconds PhenoGraph complete in 3.675712823867798 seconds Found communities [-1, ... 17], with sizes: [109, 738, 567, 562, 555, 299, 263, 229, 196, 150, 135, 86, 82, 82, 80, 80, 69, 52, 28] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9214785099029541 seconds Jaccard graph constructed in 0.5363421440124512 seconds Wrote graph to binary file in 0.17616724967956543 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.892314 Louvain completed 21 runs in 1.248786449432373 seconds PhenoGraph complete in 2.9004197120666504 seconds Found communities [-1, ... 20], with sizes: [99, 617, 528, 517, 444, 296, 262, 253, 205, 171, 165, 153, 143, 91, 87, 85, 73, 69, 52, 27, 14, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9119851589202881 seconds Jaccard graph constructed in 0.52254319190979 seconds Wrote graph to binary file in 0.18080687522888184 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893897 After 2 runs, maximum modularity is Q = 0.895141 Louvain completed 22 runs in 1.4266157150268555 seconds PhenoGraph complete in 3.0582773685455322 seconds Found communities [-1, ... 18], with sizes: [94, 567, 555, 507, 303, 299, 255, 252, 239, 236, 215, 204, 131, 100, 89, 86, 78, 74, 52, 26] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9102835655212402 seconds Jaccard graph constructed in 0.5288314819335938 seconds Wrote graph to binary file in 0.07883858680725098 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895046 Louvain completed 21 runs in 1.214411735534668 seconds PhenoGraph complete in 2.7481415271759033 seconds Found communities [-1, ... 18], with sizes: [96, 531, 525, 509, 463, 338, 271, 219, 217, 199, 193, 161, 143, 94, 87, 85, 81, 72, 52, 26] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3104100227355957 seconds Jaccard graph constructed in 0.5491280555725098 seconds Wrote graph to binary file in 0.08621072769165039 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891554 Louvain completed 21 runs in 1.3808362483978271 seconds PhenoGraph complete in 3.344397783279419 seconds Found communities [-1, ... 19], with sizes: [96, 542, 525, 506, 497, 402, 269, 260, 216, 189, 153, 134, 90, 88, 86, 81, 77, 57, 52, 26, 16] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0127100944519043 seconds Jaccard graph constructed in 0.6711044311523438 seconds Wrote graph to binary file in 0.07966256141662598 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89389 After 7 runs, maximum modularity is Q = 0.895132 Louvain completed 27 runs in 1.7397046089172363 seconds PhenoGraph complete in 3.520399570465088 seconds Found communities [-1, ... 17], with sizes: [70, 686, 565, 540, 529, 389, 267, 238, 221, 136, 134, 97, 88, 86, 83, 81, 75, 52, 25] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.913926362991333 seconds Jaccard graph constructed in 0.5575425624847412 seconds Wrote graph to binary file in 0.1962299346923828 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891216 Louvain completed 21 runs in 1.2624707221984863 seconds PhenoGraph complete in 2.947330951690674 seconds Found communities [-1, ... 18], with sizes: [87, 715, 554, 553, 429, 326, 271, 208, 202, 194, 175, 136, 94, 88, 87, 80, 68, 53, 26, 16] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.4119586944580078 seconds Jaccard graph constructed in 0.5575528144836426 seconds Wrote graph to binary file in 0.1961507797241211 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893882 After 7 runs, maximum modularity is Q = 0.895206 Louvain completed 27 runs in 1.6580243110656738 seconds PhenoGraph complete in 3.8415257930755615 seconds Found communities [-1, ... 19], with sizes: [98, 566, 545, 512, 399, 311, 274, 260, 195, 192, 186, 185, 138, 94, 86, 83, 74, 67, 52, 25, 20] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0112378597259521 seconds Jaccard graph constructed in 0.5901081562042236 seconds Wrote graph to binary file in 0.20053887367248535 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895551 Louvain completed 21 runs in 1.2312848567962646 seconds PhenoGraph complete in 3.048438787460327 seconds Found communities [-1, ... 18], with sizes: [105, 542, 510, 389, 375, 354, 340, 325, 286, 198, 154, 148, 142, 89, 87, 86, 77, 76, 53, 26] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.4124422073364258 seconds Jaccard graph constructed in 0.5533256530761719 seconds Wrote graph to binary file in 0.2122361660003662 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.894506 Louvain completed 21 runs in 1.266465425491333 seconds PhenoGraph complete in 3.4707083702087402 seconds Found communities [-1, ... 19], with sizes: [100, 611, 592, 560, 530, 272, 267, 184, 184, 170, 163, 144, 88, 86, 85, 80, 78, 75, 52, 26, 15] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.91042160987854 seconds Jaccard graph constructed in 0.544696569442749 seconds Wrote graph to binary file in 0.20070505142211914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.896103 After 2 runs, maximum modularity is Q = 0.897147 Louvain completed 22 runs in 1.4633309841156006 seconds PhenoGraph complete in 3.1344850063323975 seconds Found communities [-1, ... 19], with sizes: [105, 594, 586, 581, 498, 310, 251, 215, 188, 180, 173, 128, 92, 86, 83, 80, 69, 54, 33, 30, 26] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0117473602294922 seconds Jaccard graph constructed in 0.528874397277832 seconds Wrote graph to binary file in 0.07802391052246094 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893695 Louvain completed 21 runs in 1.224231481552124 seconds PhenoGraph complete in 2.8586387634277344 seconds Found communities [-1, ... 19], with sizes: [104, 581, 495, 448, 370, 360, 298, 262, 258, 215, 171, 161, 130, 111, 90, 85, 85, 54, 39, 26, 19] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7101707458496094 seconds Jaccard graph constructed in 0.5220203399658203 seconds Wrote graph to binary file in 0.1974945068359375 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.892453 After 9 runs, maximum modularity is Q = 0.893926 Louvain completed 29 runs in 1.802060604095459 seconds PhenoGraph complete in 3.2479052543640137 seconds Found communities [-1, ... 19], with sizes: [83, 749, 553, 537, 499, 376, 254, 196, 184, 144, 144, 103, 87, 82, 80, 79, 72, 53, 47, 26, 14] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9182157516479492 seconds Jaccard graph constructed in 0.5311503410339355 seconds Wrote graph to binary file in 0.19744205474853516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895217 After 5 runs, maximum modularity is Q = 0.896632 Louvain completed 25 runs in 1.5544304847717285 seconds PhenoGraph complete in 3.216425657272339 seconds Found communities [-1, ... 19], with sizes: [100, 553, 545, 514, 466, 329, 297, 216, 195, 176, 162, 131, 105, 93, 88, 86, 78, 77, 73, 52, 26] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8139538764953613 seconds Jaccard graph constructed in 0.5388381481170654 seconds Wrote graph to binary file in 0.2006535530090332 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891788 After 3 runs, maximum modularity is Q = 0.892803 Louvain completed 23 runs in 1.5210297107696533 seconds PhenoGraph complete in 3.0938730239868164 seconds Found communities [-1, ... 18], with sizes: [75, 742, 538, 519, 510, 420, 287, 182, 154, 150, 149, 143, 86, 84, 82, 77, 72, 52, 27, 13] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.417912244796753 seconds Jaccard graph constructed in 0.5502862930297852 seconds Wrote graph to binary file in 0.2003021240234375 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895648 Louvain completed 21 runs in 1.2384800910949707 seconds PhenoGraph complete in 3.423154592514038 seconds Found communities [-1, ... 20], with sizes: [116, 573, 573, 520, 452, 375, 237, 236, 199, 188, 157, 124, 98, 86, 86, 84, 81, 61, 53, 27, 22, 14] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9103193283081055 seconds Jaccard graph constructed in 0.5651466846466064 seconds Wrote graph to binary file in 0.07817745208740234 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893139 Louvain completed 21 runs in 1.2251660823822021 seconds PhenoGraph complete in 2.7935497760772705 seconds Found communities [-1, ... 18], with sizes: [96, 559, 549, 509, 491, 348, 257, 233, 211, 188, 154, 141, 127, 89, 87, 85, 80, 79, 53, 26] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7102034091949463 seconds Jaccard graph constructed in 0.5361120700836182 seconds Wrote graph to binary file in 0.19938349723815918 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.892193 After 5 runs, maximum modularity is Q = 0.89363 Louvain completed 25 runs in 1.562617540359497 seconds PhenoGraph complete in 3.0256247520446777 seconds Found communities [-1, ... 17], with sizes: [103, 543, 531, 495, 451, 355, 344, 280, 243, 197, 196, 131, 91, 86, 84, 80, 74, 52, 26] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0116286277770996 seconds Jaccard graph constructed in 0.5348474979400635 seconds Wrote graph to binary file in 0.18000102043151855 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.894301 Louvain completed 21 runs in 1.299785852432251 seconds PhenoGraph complete in 3.0412437915802 seconds Found communities [-1, ... 18], with sizes: [96, 577, 497, 487, 454, 310, 308, 283, 276, 190, 138, 135, 131, 90, 86, 81, 74, 70, 52, 27]
# Scale every cell to a total of 10,000 counts (in place).
sc.pp.normalize_per_cell(D363_Brus_Nas1, counts_per_cell_after=1e4) # Normalize all data
# log(1 + x) transform of the normalized values (in place).
sc.pp.log1p(D363_Brus_Nas1) # log transform the data
# Snapshot the normalized, log-transformed matrix in .raw *before* the gene
# subset below, so .raw still holds every gene present at this point.
D363_Brus_Nas1.raw = D363_Brus_Nas1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): the flag is built earlier in the notebook; in the sibling
# per-sample cells it is True for genes that are NOT in the RB gene list, so
# this line would drop the ribosomal genes -- confirm it is built the same
# way for this sample.  The result is a view (see the recorded cell output).
D363_Brus_Nas1 = D363_Brus_Nas1[:, D363_Brus_Nas1.var['ribo_genes']]
# Last expression of the cell: displays the object summary.
D363_Brus_Nas1
View of AnnData object with n_obs × n_vars = 3490 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the filtered 10x count matrix of sample D367_Brus_Nas1, indexing genes
# by symbol; cache=True lets scanpy reuse its cached copy on later runs.
D367_Brus_Nas1 = sc.read_10x_mtx(
    './D367_Brus_Nas1/' + outsPath, var_names='gene_symbols', cache=True
)
# Gene symbols can repeat; make them unique before they are used as an index.
D367_Brus_Nas1.var_names_make_unique()

# Sample-level annotations, identical for every cell of this dataset.
D367_Brus_Nas1.obs['manip'] = 'D367_Brus_Nas1'
D367_Brus_Nas1.obs['position'] = 'Nasal'
D367_Brus_Nas1.obs['method'] = 'Brushing'
D367_Brus_Nas1.obs['donor'] = 'D367'

# Prefix each barcode with the sample name and use the result as the cell
# index (presumably so cell names stay unique once the datasets are
# aggregated -- confirm against the aggregation step).
D367_Brus_Nas1.obs['name'] = [
    'D367_Brus_Nas1_' + barcode for barcode in D367_Brus_Nas1.obs_names
]
D367_Brus_Nas1.obs_names = D367_Brus_Nas1.obs['name']

# Last expression of the cell: displays the object summary.
D367_Brus_Nas1
... reading from cache file ./cache/D367_Brus_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2596 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell QC metrics for D367_Brus_Nas1: number of detected genes, total
# counts, mitochondrial fraction and ribosomal fraction.
sc.pl.highest_expr_genes(D367_Brus_Nas1, n_top=20)

# min_genes=0 removes no cell; the call is used for its side effect of
# adding obs['n_genes'], which the violin plot below needs.
sc.pp.filter_cells(D367_Brus_Nas1, min_genes=0)

# Mitochondrial genes are recognised by their 'MT-' symbol prefix.
# NOTE: despite the name, 'percent_mito' is a fraction in [0, 1].
mito_genes = D367_Brus_Nas1.var_names.str.startswith('MT-')
D367_Brus_Nas1.obs['percent_mito'] = np.sum(
    D367_Brus_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D367_Brus_Nas1.X, axis=1).A1
D367_Brus_Nas1.obs['n_counts'] = D367_Brus_Nas1.X.sum(axis=1).A1

# remove_RB_genes() is called only to learn which listed RB genes are
# present in this dataset; the other two return values are kept under their
# original names in case later cells read them.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D367_Brus_Nas1.to_df())

# The columns of to_df() are exactly var_names, so the mask is built from
# var_names directly instead of densifying the whole matrix a second time.
ribo_genes = D367_Brus_Nas1.var_names.isin(RB_genes_in_df)
D367_Brus_Nas1.obs['percent_ribo'] = np.sum(
    D367_Brus_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D367_Brus_Nas1.X, axis=1).A1

# CAUTION: var['ribo_genes'] is the *inverse* of the mask above: it is True
# for genes that are NOT in the RB list, i.e. it is a "keep this gene" flag
# used later to drop the ribosomal genes.  The key name is left unchanged
# because later cells index with it.
D367_Brus_Nas1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D367_Brus_Nas1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D367_Brus_Nas1.
# 1) keep cells with at least 500 detected genes (in place);
sc.pp.filter_cells(D367_Brus_Nas1, min_genes=500)
# 2) keep cells with fewer than 30,000 total counts and a mitochondrial
#    fraction below 0.5 ('percent_mito' is stored as a fraction).
#    NOTE(review): the count ceiling differs between samples; presumably it
#    was read off this sample's violin plot -- confirm.
# Both conditions go into one mask and the subset is materialised with
# .copy(): the doublet-detection cells that follow assign new obs columns,
# and doing that on a view (here it was a view of a view) relies on anndata
# silently turning the view into a copy.
D367_Brus_Nas1 = D367_Brus_Nas1[
    (D367_Brus_Nas1.obs['n_counts'] < 30000)
    & (D367_Brus_Nas1.obs['percent_mito'] < 0.5), :].copy()
# scrublet
# Score every remaining cell for being a doublet.  Scrublet receives the
# count matrix as it stands after the QC filters above; normalization is only
# applied further down in the notebook.
# NOTE(review): expected_doublet_rate is hard-coded per sample; presumably it
# was derived from the number of cells in this run -- confirm.
scrub = scr.Scrublet(D367_Brus_Nas1.X, expected_doublet_rate=0.02)
# One score and one doublet call per cell; the score threshold is set
# automatically by Scrublet (reported in the cell output).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata.
D367_Brus_Nas1.obs['doublet_scores'] = doublet_scores
D367_Brus_Nas1.obs['predicted_doublets'] = predicted_doublets
# Last expression of the cell: draws the doublet-score histograms and shows
# the returned (figure, axes) pair as the cell output.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.24 Detected doublet rate = 0.3% Estimated detectable doublet fraction = 12.8% Overall doublet rate: Expected = 2.0% Estimated = 2.1% Elapsed time: 2.3 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ec3f50438>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ec3f0bc18>],
dtype=object))
# doubletDetection
# Second, independent doublet call using DoubletDetection with the default
# BoostClassifier settings (the recorded log shows 25 iterations).
clf = doubletdetection.BoostClassifier()
# The sparse count matrix is passed as is; the library densifies it itself
# (see the UserWarning in the cell output).
labels = clf.fit(D367_Brus_Nas1.X).predict()
# Per-cell labels returned by predict(), stored next to the Scrublet results.
# NOTE(review): presumably 1 = doublet, 0 = singlet, NaN = ambiguous --
# confirm against the installed doubletdetection version.
D367_Brus_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5084607601165771 seconds Jaccard graph constructed in 0.4668412208557129 seconds Wrote graph to binary file in 0.1636643409729004 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900197 After 4 runs, maximum modularity is Q = 0.902022 Louvain completed 24 runs in 1.3420603275299072 seconds PhenoGraph complete in 2.4956376552581787 seconds Found communities [-1, ... 20], with sizes: [127, 563, 345, 302, 292, 222, 208, 171, 136, 119, 113, 98, 89, 73, 71, 67, 61, 53, 52, 33, 29, 14] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.311171293258667 seconds Jaccard graph constructed in 0.4383265972137451 seconds Wrote graph to binary file in 0.061249732971191406 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900962 After 3 runs, maximum modularity is Q = 0.902774 Louvain completed 23 runs in 1.2876784801483154 seconds PhenoGraph complete in 2.1234192848205566 seconds Found communities [-1, ... 21], with sizes: [119, 536, 415, 292, 216, 166, 160, 157, 156, 153, 133, 123, 104, 101, 88, 71, 60, 51, 38, 37, 25, 21, 16] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7086927890777588 seconds Jaccard graph constructed in 0.4393768310546875 seconds Wrote graph to binary file in 0.16389083862304688 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902961 After 2 runs, maximum modularity is Q = 0.905819 Louvain completed 22 runs in 1.2734806537628174 seconds PhenoGraph complete in 2.608114242553711 seconds Found communities [-1, ... 19], with sizes: [149, 583, 344, 314, 264, 252, 160, 150, 139, 132, 124, 103, 103, 93, 75, 64, 63, 51, 32, 26, 17] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3153059482574463 seconds Jaccard graph constructed in 0.5013718605041504 seconds Wrote graph to binary file in 0.05684971809387207 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902067 After 2 runs, maximum modularity is Q = 0.903448 Louvain completed 22 runs in 1.2163736820220947 seconds PhenoGraph complete in 2.1063852310180664 seconds Found communities [-1, ... 23], with sizes: [97, 592, 333, 314, 242, 220, 146, 136, 110, 107, 103, 94, 92, 85, 75, 68, 67, 66, 63, 60, 52, 39, 36, 22, 19] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5093197822570801 seconds Jaccard graph constructed in 0.5849921703338623 seconds Wrote graph to binary file in 0.057523488998413086 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901447 After 18 runs, maximum modularity is Q = 0.902698 Louvain completed 38 runs in 1.9057786464691162 seconds PhenoGraph complete in 3.070661783218384 seconds Found communities [-1, ... 19], with sizes: [124, 578, 398, 235, 218, 218, 206, 166, 165, 139, 137, 126, 111, 104, 74, 64, 47, 44, 36, 32, 16] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30880260467529297 seconds Jaccard graph constructed in 0.43804430961608887 seconds Wrote graph to binary file in 0.1864478588104248 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901078 Louvain completed 21 runs in 1.083399772644043 seconds PhenoGraph complete in 2.0399274826049805 seconds Found communities [-1, ... 21], with sizes: [144, 574, 395, 313, 232, 169, 165, 158, 156, 119, 108, 101, 88, 83, 80, 80, 68, 65, 63, 36, 18, 12, 11] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3083302974700928 seconds Jaccard graph constructed in 0.43538928031921387 seconds Wrote graph to binary file in 0.05936908721923828 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900565 After 2 runs, maximum modularity is Q = 0.902568 Louvain completed 22 runs in 1.2561678886413574 seconds PhenoGraph complete in 2.083144187927246 seconds Found communities [-1, ... 22], with sizes: [144, 579, 428, 281, 203, 168, 154, 150, 132, 131, 107, 104, 94, 88, 88, 76, 73, 63, 58, 36, 29, 22, 17, 13] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5113711357116699 seconds Jaccard graph constructed in 0.48171424865722656 seconds Wrote graph to binary file in 0.1784346103668213 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902065 Louvain completed 21 runs in 1.0554733276367188 seconds PhenoGraph complete in 2.2406728267669678 seconds Found communities [-1, ... 20], with sizes: [112, 647, 351, 275, 270, 262, 147, 136, 132, 123, 99, 98, 95, 92, 70, 69, 68, 55, 49, 36, 36, 16] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30907249450683594 seconds Jaccard graph constructed in 0.43910741806030273 seconds Wrote graph to binary file in 0.0630044937133789 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900294 After 13 runs, maximum modularity is Q = 0.901576 Louvain completed 33 runs in 1.7589507102966309 seconds PhenoGraph complete in 2.5902557373046875 seconds Found communities [-1, ... 22], with sizes: [140, 592, 316, 290, 269, 238, 204, 168, 134, 106, 106, 100, 92, 91, 71, 61, 56, 48, 39, 39, 34, 17, 15, 12] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31041908264160156 seconds Jaccard graph constructed in 0.5869097709655762 seconds Wrote graph to binary file in 0.05759692192077637 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904244 Louvain completed 21 runs in 1.06337571144104 seconds PhenoGraph complete in 2.0424818992614746 seconds Found communities [-1, ... 19], with sizes: [116, 565, 426, 308, 235, 204, 177, 162, 159, 115, 108, 100, 99, 79, 75, 70, 69, 63, 59, 32, 17] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5087718963623047 seconds Jaccard graph constructed in 0.4409444332122803 seconds Wrote graph to binary file in 0.16645145416259766 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902516 After 2 runs, maximum modularity is Q = 0.905016 Louvain completed 22 runs in 1.2692945003509521 seconds PhenoGraph complete in 2.400050163269043 seconds Found communities [-1, ... 21], with sizes: [133, 574, 392, 288, 249, 247, 168, 158, 157, 113, 111, 101, 87, 77, 73, 66, 61, 53, 37, 33, 28, 18, 14] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31087613105773926 seconds Jaccard graph constructed in 0.47104954719543457 seconds Wrote graph to binary file in 0.05683398246765137 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903375 Louvain completed 21 runs in 1.0595769882202148 seconds PhenoGraph complete in 1.9135053157806396 seconds Found communities [-1, ... 21], with sizes: [132, 645, 328, 320, 263, 161, 154, 147, 130, 123, 121, 113, 106, 90, 87, 69, 66, 59, 36, 28, 25, 20, 15] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.714332103729248 seconds Jaccard graph constructed in 0.49078965187072754 seconds Wrote graph to binary file in 0.17988967895507812 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902795 After 4 runs, maximum modularity is Q = 0.90413 After 15 runs, maximum modularity is Q = 0.905254 Louvain completed 35 runs in 1.9082610607147217 seconds PhenoGraph complete in 3.309579849243164 seconds Found communities [-1, ... 21], with sizes: [155, 619, 363, 331, 220, 198, 182, 165, 130, 112, 102, 86, 85, 85, 70, 70, 64, 52, 51, 35, 32, 16, 15] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.608839750289917 seconds Jaccard graph constructed in 0.4503631591796875 seconds Wrote graph to binary file in 0.06904840469360352 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901071 After 2 runs, maximum modularity is Q = 0.902185 Louvain completed 22 runs in 1.2976205348968506 seconds PhenoGraph complete in 2.4437458515167236 seconds Found communities [-1, ... 18], with sizes: [155, 591, 341, 302, 299, 276, 164, 150, 146, 134, 116, 98, 94, 80, 69, 68, 66, 40, 33, 16] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4082968235015869 seconds Jaccard graph constructed in 0.4441239833831787 seconds Wrote graph to binary file in 0.15816211700439453 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899355 After 5 runs, maximum modularity is Q = 0.90109 Louvain completed 25 runs in 1.411998987197876 seconds PhenoGraph complete in 2.4448084831237793 seconds Found communities [-1, ... 20], with sizes: [134, 594, 391, 281, 230, 176, 158, 154, 147, 134, 127, 125, 103, 99, 90, 70, 68, 54, 37, 34, 17, 15] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5099594593048096 seconds Jaccard graph constructed in 0.5047998428344727 seconds Wrote graph to binary file in 0.18423247337341309 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905307 Louvain completed 21 runs in 1.0799944400787354 seconds PhenoGraph complete in 2.2988812923431396 seconds Found communities [-1, ... 24], with sizes: [101, 602, 425, 282, 181, 172, 161, 145, 142, 127, 105, 102, 87, 86, 81, 69, 67, 63, 45, 43, 37, 33, 26, 25, 17, 14] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5108683109283447 seconds Jaccard graph constructed in 0.4646470546722412 seconds Wrote graph to binary file in 0.05773448944091797 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901732 Louvain completed 21 runs in 1.0860049724578857 seconds PhenoGraph complete in 2.150665283203125 seconds Found communities [-1, ... 21], with sizes: [115, 595, 336, 293, 242, 181, 154, 150, 134, 133, 128, 110, 106, 98, 92, 87, 69, 63, 41, 41, 37, 17, 16] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5178375244140625 seconds Jaccard graph constructed in 0.4842367172241211 seconds Wrote graph to binary file in 0.18054437637329102 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900731 Louvain completed 21 runs in 1.0764310359954834 seconds PhenoGraph complete in 2.2772862911224365 seconds Found communities [-1, ... 22], with sizes: [122, 608, 363, 279, 250, 238, 155, 135, 125, 124, 105, 104, 90, 87, 75, 69, 68, 68, 54, 36, 32, 18, 18, 15] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5122959613800049 seconds Jaccard graph constructed in 0.43636131286621094 seconds Wrote graph to binary file in 0.05808568000793457 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902472 After 3 runs, maximum modularity is Q = 0.903498 Louvain completed 23 runs in 1.2720496654510498 seconds PhenoGraph complete in 2.3114070892333984 seconds Found communities [-1, ... 22], with sizes: [112, 564, 298, 281, 271, 206, 184, 165, 150, 142, 113, 100, 99, 99, 96, 71, 60, 57, 44, 38, 33, 25, 17, 13] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5092461109161377 seconds Jaccard graph constructed in 0.4480714797973633 seconds Wrote graph to binary file in 0.16853785514831543 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903368 Louvain completed 21 runs in 1.1005442142486572 seconds PhenoGraph complete in 2.2482662200927734 seconds Found communities [-1, ... 21], with sizes: [112, 556, 332, 318, 249, 237, 223, 164, 159, 118, 108, 101, 89, 72, 70, 63, 61, 57, 46, 33, 32, 20, 18] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5079901218414307 seconds Jaccard graph constructed in 0.43482112884521484 seconds Wrote graph to binary file in 0.16631746292114258 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90279 After 2 runs, maximum modularity is Q = 0.904373 Louvain completed 22 runs in 1.2382500171661377 seconds PhenoGraph complete in 2.3846590518951416 seconds Found communities [-1, ... 21], with sizes: [145, 595, 409, 327, 202, 202, 200, 135, 135, 132, 108, 99, 99, 83, 79, 70, 62, 39, 35, 31, 23, 16, 12] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5132453441619873 seconds Jaccard graph constructed in 0.45227956771850586 seconds Wrote graph to binary file in 0.05643153190612793 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899397 After 2 runs, maximum modularity is Q = 0.901089 Louvain completed 22 runs in 1.2974295616149902 seconds PhenoGraph complete in 2.3314359188079834 seconds Found communities [-1, ... 22], with sizes: [119, 630, 401, 307, 254, 179, 163, 162, 132, 125, 109, 106, 93, 89, 75, 63, 56, 42, 36, 30, 20, 18, 17, 12] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5112402439117432 seconds Jaccard graph constructed in 0.43253445625305176 seconds Wrote graph to binary file in 0.20474982261657715 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903668 Louvain completed 21 runs in 1.0554800033569336 seconds PhenoGraph complete in 2.235367774963379 seconds Found communities [-1, ... 20], with sizes: [114, 590, 353, 314, 285, 248, 163, 162, 138, 114, 103, 92, 85, 83, 69, 68, 67, 59, 49, 40, 25, 17] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5084431171417236 seconds Jaccard graph constructed in 0.4371986389160156 seconds Wrote graph to binary file in 0.06174921989440918 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899561 After 5 runs, maximum modularity is Q = 0.901489 Louvain completed 25 runs in 1.4370558261871338 seconds PhenoGraph complete in 2.460622549057007 seconds Found communities [-1, ... 22], with sizes: [133, 533, 316, 307, 289, 190, 183, 149, 137, 136, 107, 105, 104, 94, 73, 71, 61, 55, 50, 48, 32, 30, 19, 16] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5137088298797607 seconds Jaccard graph constructed in 0.45734453201293945 seconds Wrote graph to binary file in 0.1864638328552246 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902882 Louvain completed 21 runs in 1.0718755722045898 seconds PhenoGraph complete in 2.2464241981506348 seconds Found communities [-1, ... 22], with sizes: [153, 569, 319, 277, 254, 209, 174, 172, 134, 133, 114, 104, 91, 89, 81, 70, 66, 66, 38, 33, 33, 32, 16, 11]
# Scale every cell to a total of 10,000 counts (in place).  Ribosomal genes
# are still part of the matrix here, so they contribute to the totals.
sc.pp.normalize_per_cell(D367_Brus_Nas1, counts_per_cell_after=1e4) # Normalize all data
# log(1 + x) transform of the normalized values (in place).
sc.pp.log1p(D367_Brus_Nas1) # log transform the data
# Snapshot the normalized, log-transformed matrix in .raw *before* the gene
# subset below, so .raw still holds every gene, ribosomal ones included.
D367_Brus_Nas1.raw = D367_Brus_Nas1 # freeze the object (for later use of the raw state of it)
# var['ribo_genes'] is True for genes that are NOT in the RB gene list (it is
# stored inverted in the QC cell), so this subset drops the ribosomal genes.
# The result is a view of the object, as the recorded cell output shows.
D367_Brus_Nas1 = D367_Brus_Nas1[:, D367_Brus_Nas1.var['ribo_genes']]
# Last expression of the cell: displays the object summary.
D367_Brus_Nas1
View of AnnData object with n_obs × n_vars = 2591 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the filtered 10x count matrix of sample D372_Brus_Nas1, indexing genes
# by symbol; cache=True lets scanpy reuse its cached copy on later runs.
D372_Brus_Nas1 = sc.read_10x_mtx(
    './D372_Brus_Nas1/' + outsPath, var_names='gene_symbols', cache=True
)
# Gene symbols can repeat; make them unique before they are used as an index.
D372_Brus_Nas1.var_names_make_unique()

# Sample-level annotations, identical for every cell of this dataset.
D372_Brus_Nas1.obs['manip'] = 'D372_Brus_Nas1'
D372_Brus_Nas1.obs['position'] = 'Nasal'
D372_Brus_Nas1.obs['method'] = 'Brushing'
D372_Brus_Nas1.obs['donor'] = 'D372'

# Prefix each barcode with the sample name and use the result as the cell
# index (presumably so cell names stay unique once the datasets are
# aggregated -- confirm against the aggregation step).
D372_Brus_Nas1.obs['name'] = [
    'D372_Brus_Nas1_' + barcode for barcode in D372_Brus_Nas1.obs_names
]
D372_Brus_Nas1.obs_names = D372_Brus_Nas1.obs['name']

# Last expression of the cell: displays the object summary.
D372_Brus_Nas1
... reading from cache file ./cache/D372_Brus_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2336 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# ---- Per-cell QC metrics for D372_Brus_Nas1 ----
# Overview of the 20 most highly expressed genes before any filtering.
sc.pl.highest_expr_genes(D372_Brus_Nas1, n_top=20)
# min_genes=0 is not meant to discard cells; the call is used because
# filter_cells annotates obs['n_genes'], which is plotted below.
sc.pp.filter_cells(D372_Brus_Nas1, min_genes=0)

# Fraction of each cell's counts coming from mitochondrial ('MT-' prefixed) genes.
mito_genes = D372_Brus_Nas1.var_names.str.startswith('MT-')
D372_Brus_Nas1.obs['percent_mito'] = np.sum(
    D372_Brus_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D372_Brus_Nas1.X, axis=1).A1
# Total counts per cell.
D372_Brus_Nas1.obs['n_counts'] = D372_Brus_Nas1.X.sum(axis=1).A1

# remove_RB_genes needs a cells x genes DataFrame. Only the returned list of
# ribosomal genes present in this dataset is used here; the other two names
# are still bound exactly as before.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D372_Brus_Nas1.to_df())
# The DataFrame columns are the var_names, so test membership on var_names
# directly instead of densifying the whole matrix a second time.
ribo_genes = D372_Brus_Nas1.var_names.isin(RB_genes_in_df)
D372_Brus_Nas1.obs['percent_ribo'] = np.sum(
    D372_Brus_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Brus_Nas1.X, axis=1).A1
# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT in
# the ribosomal list, i.e. it is the mask of genes to keep when subsetting later.
D372_Brus_Nas1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D372_Brus_Nas1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# ---- Low-quality cell filtering for D372_Brus_Nas1 ----
# Drop cells with fewer than 500 detected genes (in place).
sc.pp.filter_cells(D372_Brus_Nas1, min_genes=500)
# Keep cells with fewer than 40,000 total counts and a mitochondrial fraction
# below 0.5. Both conditions are applied as one mask, and the result is copied
# explicitly: the doublet-inference cells that follow write new columns into
# .obs, and writing to a view would otherwise trigger an implicit copy.
D372_Brus_Nas1 = D372_Brus_Nas1[
    (D372_Brus_Nas1.obs['n_counts'] < 40000)
    & (D372_Brus_Nas1.obs['percent_mito'] < 0.5),
    :,
].copy()
# scrublet: doublet inference on the filtered, not yet normalised matrix.
# expected_doublet_rate is set per sample.
# NOTE(review): presumably derived from the number of cells loaded - confirm.
scrub = scr.Scrublet(D372_Brus_Nas1.X, expected_doublet_rate=0.019)
# Returns one score and one doublet call per cell; the recorded output shows
# that the score threshold is set automatically.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D372_Brus_Nas1.obs['doublet_scores'] = doublet_scores
D372_Brus_Nas1.obs['predicted_doublets'] = predicted_doublets
# Plot the score histograms to check the automatic threshold by eye.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.22 Detected doublet rate = 0.3% Estimated detectable doublet fraction = 15.0% Overall doublet rate: Expected = 1.9% Estimated = 1.7% Elapsed time: 2.1 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eca849a20>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecba29fd0>],
dtype=object))
# doubletDetection: second doublet caller, run with the classifier's default
# settings (25 iterations in the recorded output).
clf = doubletdetection.BoostClassifier()
# Fit on the same filtered matrix and get one label per cell. .X is sparse
# here, so the library densifies it itself (see the UserWarning it emits).
labels = clf.fit(D372_Brus_Nas1.X).predict()
# Store the labels in the cell metadata next to the scrublet results.
D372_Brus_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4104349613189697 seconds Jaccard graph constructed in 0.48456406593322754 seconds Wrote graph to binary file in 0.05161619186401367 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902756 Louvain completed 21 runs in 1.038480281829834 seconds PhenoGraph complete in 1.9975333213806152 seconds Found communities [-1, ... 21], with sizes: [93, 457, 316, 305, 289, 288, 164, 146, 128, 114, 95, 67, 66, 65, 56, 50, 49, 35, 34, 34, 22, 21, 17] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4088413715362549 seconds Jaccard graph constructed in 0.4397096633911133 seconds Wrote graph to binary file in 0.16575980186462402 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905068 Louvain completed 21 runs in 1.0620262622833252 seconds PhenoGraph complete in 2.0957629680633545 seconds Found communities [-1, ... 20], with sizes: [91, 418, 315, 304, 291, 252, 182, 174, 173, 141, 74, 73, 72, 68, 64, 53, 39, 38, 36, 22, 19, 12] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41448283195495605 seconds Jaccard graph constructed in 0.4419240951538086 seconds Wrote graph to binary file in 0.054010629653930664 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904481 Louvain completed 21 runs in 1.05592942237854 seconds PhenoGraph complete in 1.9787116050720215 seconds Found communities [-1, ... 21], with sizes: [100, 403, 329, 288, 245, 217, 211, 185, 152, 121, 110, 80, 74, 69, 68, 56, 47, 39, 35, 29, 22, 18, 13] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4087090492248535 seconds Jaccard graph constructed in 0.43219971656799316 seconds Wrote graph to binary file in 0.18044114112854004 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899057 Louvain completed 21 runs in 1.039236307144165 seconds PhenoGraph complete in 2.0733461380004883 seconds Found communities [-1, ... 20], with sizes: [80, 406, 341, 320, 292, 236, 176, 153, 152, 139, 90, 76, 75, 71, 55, 52, 51, 44, 34, 29, 23, 16] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4097177982330322 seconds Jaccard graph constructed in 0.4422883987426758 seconds Wrote graph to binary file in 0.05698728561401367 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900724 Louvain completed 21 runs in 1.0721895694732666 seconds PhenoGraph complete in 2.001919984817505 seconds Found communities [-1, ... 
23], with sizes: [88, 370, 309, 222, 195, 195, 193, 171, 146, 139, 125, 114, 88, 80, 78, 73, 64, 55, 47, 44, 34, 33, 22, 13, 13] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41097593307495117 seconds Jaccard graph constructed in 0.5679218769073486 seconds Wrote graph to binary file in 0.05442643165588379 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900366 After 4 runs, maximum modularity is Q = 0.901533 Louvain completed 24 runs in 1.3656713962554932 seconds PhenoGraph complete in 2.410922050476074 seconds Found communities [-1, ... 22], with sizes: [95, 396, 333, 298, 253, 250, 176, 144, 120, 89, 85, 85, 77, 74, 71, 66, 56, 52, 42, 42, 34, 34, 22, 17] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5100057125091553 seconds Jaccard graph constructed in 0.4215846061706543 seconds Wrote graph to binary file in 0.1642932891845703 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902812 Louvain completed 21 runs in 1.056971788406372 seconds PhenoGraph complete in 2.165310859680176 seconds Found communities [-1, ... 21], with sizes: [97, 395, 372, 269, 255, 251, 242, 165, 124, 122, 87, 82, 80, 63, 62, 50, 37, 36, 34, 33, 22, 20, 13] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4157731533050537 seconds Jaccard graph constructed in 0.44240331649780273 seconds Wrote graph to binary file in 0.05346488952636719 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904502 Louvain completed 21 runs in 1.0716676712036133 seconds PhenoGraph complete in 1.9949562549591064 seconds Found communities [-1, ... 21], with sizes: [74, 398, 330, 280, 277, 257, 197, 179, 172, 144, 81, 76, 74, 70, 59, 50, 42, 37, 34, 29, 23, 16, 12] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4089932441711426 seconds Jaccard graph constructed in 0.4276108741760254 seconds Wrote graph to binary file in 0.18631744384765625 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899931 After 4 runs, maximum modularity is Q = 0.901282 Louvain completed 24 runs in 1.332381010055542 seconds PhenoGraph complete in 2.370694398880005 seconds Found communities [-1, ... 23], with sizes: [85, 382, 245, 243, 237, 188, 183, 156, 147, 123, 122, 117, 96, 80, 76, 72, 66, 56, 51, 48, 35, 34, 33, 22, 14] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40883970260620117 seconds Jaccard graph constructed in 0.4815654754638672 seconds Wrote graph to binary file in 0.05580449104309082 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905739 Louvain completed 21 runs in 1.0528161525726318 seconds PhenoGraph complete in 2.0144546031951904 seconds Found communities [-1, ... 20], with sizes: [89, 407, 327, 298, 245, 221, 187, 168, 150, 142, 111, 82, 77, 73, 72, 55, 54, 41, 37, 35, 22, 18] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4092233180999756 seconds Jaccard graph constructed in 0.43491411209106445 seconds Wrote graph to binary file in 0.1816234588623047 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902347 Louvain completed 21 runs in 1.0661087036132812 seconds PhenoGraph complete in 2.1032302379608154 seconds Found communities [-1, ... 21], with sizes: [85, 413, 385, 294, 292, 275, 195, 180, 159, 81, 76, 68, 68, 53, 52, 43, 41, 35, 34, 29, 22, 19, 12] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4102473258972168 seconds Jaccard graph constructed in 0.4155423641204834 seconds Wrote graph to binary file in 0.056139469146728516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90426 Louvain completed 21 runs in 1.0423285961151123 seconds PhenoGraph complete in 1.9398508071899414 seconds Found communities [-1, ... 
22], with sizes: [94, 426, 366, 282, 246, 211, 206, 191, 144, 120, 84, 74, 73, 72, 65, 46, 40, 38, 36, 30, 22, 18, 16, 11] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4085385799407959 seconds Jaccard graph constructed in 0.4093973636627197 seconds Wrote graph to binary file in 0.16074681282043457 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902218 Louvain completed 21 runs in 1.0713722705841064 seconds PhenoGraph complete in 2.0610687732696533 seconds Found communities [-1, ... 22], with sizes: [94, 425, 304, 281, 273, 236, 182, 171, 150, 142, 112, 73, 68, 67, 65, 53, 42, 35, 35, 34, 22, 18, 15, 14] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4122939109802246 seconds Jaccard graph constructed in 0.43650269508361816 seconds Wrote graph to binary file in 0.058606863021850586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900811 After 5 runs, maximum modularity is Q = 0.901957 Louvain completed 25 runs in 1.380363941192627 seconds PhenoGraph complete in 2.306610107421875 seconds Found communities [-1, ... 21], with sizes: [107, 413, 315, 298, 296, 269, 221, 184, 117, 96, 86, 76, 64, 62, 54, 49, 48, 38, 34, 34, 22, 15, 13] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40839338302612305 seconds Jaccard graph constructed in 0.4399690628051758 seconds Wrote graph to binary file in 0.18412351608276367 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900966 After 2 runs, maximum modularity is Q = 0.902166 Louvain completed 22 runs in 1.2874994277954102 seconds PhenoGraph complete in 2.334472894668579 seconds Found communities [-1, ... 22], with sizes: [108, 417, 303, 295, 245, 231, 225, 206, 172, 106, 75, 69, 66, 55, 54, 50, 46, 44, 34, 29, 27, 22, 17, 15] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.419569730758667 seconds Jaccard graph constructed in 0.43588876724243164 seconds Wrote graph to binary file in 0.05357718467712402 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902249 Louvain completed 21 runs in 1.0449965000152588 seconds PhenoGraph complete in 1.974944829940796 seconds Found communities [-1, ... 20], with sizes: [105, 415, 315, 301, 277, 274, 190, 184, 117, 98, 88, 86, 79, 72, 62, 58, 42, 40, 36, 33, 22, 17] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40757298469543457 seconds Jaccard graph constructed in 0.4286377429962158 seconds Wrote graph to binary file in 0.18510174751281738 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899864 After 2 runs, maximum modularity is Q = 0.901044 Louvain completed 22 runs in 1.2612056732177734 seconds PhenoGraph complete in 2.300072193145752 seconds Found communities [-1, ... 21], with sizes: [101, 395, 370, 277, 275, 234, 205, 151, 139, 134, 79, 78, 76, 72, 57, 56, 56, 41, 35, 28, 22, 18, 12] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5133953094482422 seconds Jaccard graph constructed in 0.43563127517700195 seconds Wrote graph to binary file in 0.06734108924865723 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901219 Louvain completed 21 runs in 1.0888986587524414 seconds PhenoGraph complete in 2.133082389831543 seconds Found communities [-1, ... 19], with sizes: [83, 424, 353, 293, 292, 284, 202, 186, 154, 82, 78, 74, 71, 68, 64, 51, 47, 35, 31, 22, 17] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.414459228515625 seconds Jaccard graph constructed in 0.44033050537109375 seconds Wrote graph to binary file in 0.17866730690002441 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899097 After 2 runs, maximum modularity is Q = 0.900133 Louvain completed 22 runs in 1.2436597347259521 seconds PhenoGraph complete in 2.2888524532318115 seconds Found communities [-1, ... 23], with sizes: [108, 313, 307, 284, 259, 253, 189, 163, 126, 112, 109, 92, 78, 75, 70, 69, 65, 47, 39, 37, 34, 34, 22, 15, 11] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4095485210418701 seconds Jaccard graph constructed in 0.49824070930480957 seconds Wrote graph to binary file in 0.05281829833984375 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902287 Louvain completed 21 runs in 1.0733473300933838 seconds PhenoGraph complete in 2.0504233837127686 seconds Found communities [-1, ... 20], with sizes: [89, 423, 307, 302, 211, 205, 202, 175, 155, 136, 114, 88, 81, 80, 72, 67, 52, 45, 34, 33, 22, 18] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40773892402648926 seconds Jaccard graph constructed in 0.44121623039245605 seconds Wrote graph to binary file in 0.18178534507751465 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902912 Louvain completed 21 runs in 1.0678446292877197 seconds PhenoGraph complete in 2.1116764545440674 seconds Found communities [-1, ... 20], with sizes: [86, 364, 358, 287, 272, 261, 240, 192, 146, 132, 87, 86, 76, 73, 70, 43, 34, 30, 27, 22, 14, 11] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40897274017333984 seconds Jaccard graph constructed in 0.43024516105651855 seconds Wrote graph to binary file in 0.05281400680541992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907184 Louvain completed 21 runs in 1.0659265518188477 seconds PhenoGraph complete in 1.9704034328460693 seconds Found communities [-1, ... 22], with sizes: [121, 386, 314, 290, 281, 253, 220, 161, 141, 89, 78, 76, 70, 66, 64, 57, 47, 44, 39, 34, 28, 22, 17, 13] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40806055068969727 seconds Jaccard graph constructed in 0.43677735328674316 seconds Wrote graph to binary file in 0.15897130966186523 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902582 After 13 runs, maximum modularity is Q = 0.904015 Louvain completed 33 runs in 1.7000999450683594 seconds PhenoGraph complete in 2.715348720550537 seconds Found communities [-1, ... 21], with sizes: [108, 358, 318, 311, 289, 242, 190, 165, 141, 117, 96, 79, 75, 70, 65, 58, 47, 44, 36, 32, 31, 22, 17] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41316866874694824 seconds Jaccard graph constructed in 0.4379744529724121 seconds Wrote graph to binary file in 0.18474483489990234 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899323 After 2 runs, maximum modularity is Q = 0.900924 Louvain completed 22 runs in 1.2787697315216064 seconds PhenoGraph complete in 2.327200174331665 seconds Found communities [-1, ... 22], with sizes: [100, 385, 313, 256, 242, 182, 163, 152, 151, 151, 151, 90, 86, 72, 68, 64, 55, 50, 41, 35, 33, 32, 23, 16] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.408933162689209 seconds Jaccard graph constructed in 0.42848753929138184 seconds Wrote graph to binary file in 0.050803184509277344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899611 Louvain completed 21 runs in 1.0532317161560059 seconds PhenoGraph complete in 1.952439546585083 seconds Found communities [-1, ... 21], with sizes: [103, 410, 372, 254, 229, 225, 199, 159, 137, 135, 106, 79, 79, 78, 70, 54, 49, 37, 37, 35, 27, 23, 14]
# Post-doublet-inference normalisation for sample D372_Brus_Nas1.
# Scale every cell to a total of 10,000 counts ...
sc.pp.normalize_per_cell(D372_Brus_Nas1, counts_per_cell_after=1e4)
# ... then apply log1p to the normalised values (in place).
sc.pp.log1p(D372_Brus_Nas1)
# Keep a snapshot of the full gene set in .raw. Because this runs AFTER the two
# calls above, .raw holds the normalised, log-transformed values, not raw counts.
D372_Brus_Nas1.raw = D372_Brus_Nas1
# var['ribo_genes'] was set earlier to True for genes that are NOT in the
# ribosomal list, so this subset drops the ribosomal genes. The result is a
# view of the AnnData object (see the repr printed below).
D372_Brus_Nas1 = D372_Brus_Nas1[:, D372_Brus_Nas1.var['ribo_genes']]
D372_Brus_Nas1  # notebook display of the resulting object
View of AnnData object with n_obs × n_vars = 2329 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# ---- Sample D322_Biop_Pro1: proximal biopsy, donor D322 ----
# Read the Cell Ranger filtered matrix with gene symbols as variable names;
# cache=True lets scanpy reuse its on-disk .h5ad cache on later runs.
D322_Biop_Pro1 = sc.read_10x_mtx(
    './D322_Biop_Pro1/' + outsPath, var_names='gene_symbols', cache=True
)
# Gene symbols can repeat; make the variable names unique.
D322_Biop_Pro1.var_names_make_unique()

# Sample-level annotations, identical for every cell of this run.
D322_Biop_Pro1.obs['manip'] = 'D322_Biop_Pro1'
D322_Biop_Pro1.obs['position'] = 'Proximal'
D322_Biop_Pro1.obs['method'] = 'Biopsy'
D322_Biop_Pro1.obs['donor'] = 'D322'

# Prefix each barcode with the sample name and use the result as the cell name.
D322_Biop_Pro1.obs['name'] = [
    'D322_Biop_Pro1_' + barcode for barcode in D322_Biop_Pro1.obs_names
]
D322_Biop_Pro1.obs_names = D322_Biop_Pro1.obs['name']
D322_Biop_Pro1  # notebook display
... reading from cache file ./cache/D322_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2035 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# ---- Per-cell QC metrics for D322_Biop_Pro1 ----
# Overview of the 20 most highly expressed genes before any filtering.
sc.pl.highest_expr_genes(D322_Biop_Pro1, n_top=20)
# min_genes=0 is not meant to discard cells; the call is used because
# filter_cells annotates obs['n_genes'], which is plotted below.
sc.pp.filter_cells(D322_Biop_Pro1, min_genes=0)

# Fraction of each cell's counts coming from mitochondrial ('MT-' prefixed) genes.
mito_genes = D322_Biop_Pro1.var_names.str.startswith('MT-')
D322_Biop_Pro1.obs['percent_mito'] = np.sum(
    D322_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D322_Biop_Pro1.X, axis=1).A1
# Total counts per cell.
D322_Biop_Pro1.obs['n_counts'] = D322_Biop_Pro1.X.sum(axis=1).A1

# remove_RB_genes needs a cells x genes DataFrame. Only the returned list of
# ribosomal genes present in this dataset is used here; the other two names
# are still bound exactly as before.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D322_Biop_Pro1.to_df())
# The DataFrame columns are the var_names, so test membership on var_names
# directly instead of densifying the whole matrix a second time.
ribo_genes = D322_Biop_Pro1.var_names.isin(RB_genes_in_df)
D322_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D322_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D322_Biop_Pro1.X, axis=1).A1
# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT in
# the ribosomal list, i.e. it is the mask of genes to keep when subsetting later.
D322_Biop_Pro1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D322_Biop_Pro1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# ---- Low-quality cell filtering for D322_Biop_Pro1 ----
# Drop cells with fewer than 500 detected genes (in place).
sc.pp.filter_cells(D322_Biop_Pro1, min_genes=500)
# Keep cells with fewer than 20,000 total counts and a mitochondrial fraction
# below 0.3. Both conditions are applied as one mask, and the result is copied
# explicitly: the doublet-inference cells that follow write new columns into
# .obs, and writing to a view would otherwise trigger an implicit copy.
D322_Biop_Pro1 = D322_Biop_Pro1[
    (D322_Biop_Pro1.obs['n_counts'] < 20000)
    & (D322_Biop_Pro1.obs['percent_mito'] < 0.3),
    :,
].copy()
filtered out 60 cells that have less than 500 genes expressed
# scrublet: doublet inference on the filtered, not yet normalised matrix.
# expected_doublet_rate is set per sample.
# NOTE(review): presumably derived from the number of cells loaded - confirm.
scrub = scr.Scrublet(D322_Biop_Pro1.X, expected_doublet_rate=0.016)
# Returns one score and one doublet call per cell; the recorded output shows
# that the score threshold is set automatically.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D322_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D322_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
# Plot the score histograms to check the automatic threshold by eye.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.18 Detected doublet rate = 0.4% Estimated detectable doublet fraction = 9.6% Overall doublet rate: Expected = 1.6% Estimated = 4.3% Elapsed time: 1.0 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ebfe203c8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ebfaf2828>],
dtype=object))
# doubletDetection: second doublet caller, run with the classifier's default
# settings (25 iterations in the recorded output).
clf = doubletdetection.BoostClassifier()
# Fit on the same filtered matrix and get one label per cell. .X is sparse
# here, so the library densifies it itself (see the UserWarning it emits).
labels = clf.fit(D322_Biop_Pro1.X).predict()
# Store the labels in the cell metadata next to the scrublet results.
D322_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6393563747406006 seconds Jaccard graph constructed in 0.4113748073577881 seconds Wrote graph to binary file in 0.0392765998840332 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900925 Louvain completed 21 runs in 1.0455265045166016 seconds PhenoGraph complete in 2.1460633277893066 seconds Found communities [-1, ... 18], with sizes: [140, 811, 395, 198, 155, 147, 92, 74, 68, 53, 48, 42, 38, 38, 35, 32, 28, 24, 22, 12] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4117100238800049 seconds Jaccard graph constructed in 0.410874605178833 seconds Wrote graph to binary file in 0.03756237030029297 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899296 After 3 runs, maximum modularity is Q = 0.900804 Louvain completed 23 runs in 1.3342885971069336 seconds PhenoGraph complete in 2.2148613929748535 seconds Found communities [-1, ... 19], with sizes: [205, 769, 398, 178, 142, 140, 76, 74, 74, 69, 47, 42, 39, 35, 35, 28, 25, 25, 24, 14, 13] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40962743759155273 seconds Jaccard graph constructed in 0.4233078956604004 seconds Wrote graph to binary file in 0.17347097396850586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900464 Louvain completed 21 runs in 1.0541834831237793 seconds PhenoGraph complete in 2.0714707374572754 seconds Found communities [-1, ... 18], with sizes: [163, 778, 371, 217, 165, 160, 100, 77, 72, 63, 42, 42, 42, 34, 27, 26, 22, 21, 17, 13] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4098362922668457 seconds Jaccard graph constructed in 0.4011504650115967 seconds Wrote graph to binary file in 0.03797626495361328 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899684 Louvain completed 21 runs in 1.0307955741882324 seconds PhenoGraph complete in 1.8917357921600342 seconds Found communities [-1, ... 18], with sizes: [192, 703, 370, 237, 202, 136, 96, 83, 80, 55, 44, 42, 33, 33, 33, 27, 25, 25, 20, 16] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4088118076324463 seconds Jaccard graph constructed in 0.45768094062805176 seconds Wrote graph to binary file in 0.039757728576660156 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901873 Louvain completed 21 runs in 1.0425488948822021 seconds PhenoGraph complete in 1.9624834060668945 seconds Found communities [-1, ... 
18], with sizes: [185, 764, 398, 199, 165, 139, 91, 82, 68, 56, 43, 41, 40, 37, 33, 33, 25, 22, 17, 14] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40851902961730957 seconds Jaccard graph constructed in 0.39827442169189453 seconds Wrote graph to binary file in 0.1715395450592041 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.896473 After 2 runs, maximum modularity is Q = 0.897808 Louvain completed 22 runs in 1.255896806716919 seconds PhenoGraph complete in 2.24523663520813 seconds Found communities [-1, ... 18], with sizes: [134, 818, 349, 193, 192, 141, 98, 89, 74, 64, 47, 43, 39, 35, 29, 27, 25, 22, 21, 12] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.408618688583374 seconds Jaccard graph constructed in 0.4059271812438965 seconds Wrote graph to binary file in 0.04202532768249512 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.897849 After 2 runs, maximum modularity is Q = 0.900395 Louvain completed 22 runs in 1.311697006225586 seconds PhenoGraph complete in 2.18277645111084 seconds Found communities [-1, ... 19], with sizes: [139, 797, 390, 181, 176, 173, 80, 72, 71, 67, 44, 43, 38, 35, 30, 26, 23, 22, 17, 15, 13] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40802645683288574 seconds Jaccard graph constructed in 0.4162874221801758 seconds Wrote graph to binary file in 0.03611612319946289 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902588 Louvain completed 21 runs in 1.050546407699585 seconds PhenoGraph complete in 1.9213097095489502 seconds Found communities [-1, ... 19], with sizes: [201, 791, 289, 282, 184, 141, 91, 67, 67, 58, 38, 36, 35, 31, 30, 26, 25, 21, 14, 14, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40918421745300293 seconds Jaccard graph constructed in 0.40732789039611816 seconds Wrote graph to binary file in 0.1727921962738037 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.898019 Louvain completed 21 runs in 1.068749189376831 seconds PhenoGraph complete in 2.0713589191436768 seconds Found communities [-1, ... 19], with sizes: [171, 794, 396, 168, 164, 136, 96, 80, 69, 63, 54, 45, 36, 33, 30, 26, 25, 22, 20, 13, 11] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4082217216491699 seconds Jaccard graph constructed in 0.39855480194091797 seconds Wrote graph to binary file in 0.03763461112976074 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.898898 Louvain completed 21 runs in 1.0382418632507324 seconds PhenoGraph complete in 1.8954129219055176 seconds Found communities [-1, ... 
19], with sizes: [170, 755, 296, 217, 178, 138, 92, 90, 76, 67, 66, 52, 46, 35, 33, 32, 31, 29, 21, 16, 12] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4072906970977783 seconds Jaccard graph constructed in 0.406890869140625 seconds Wrote graph to binary file in 0.03890848159790039 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89867 After 5 runs, maximum modularity is Q = 0.899724 Louvain completed 25 runs in 1.3519103527069092 seconds PhenoGraph complete in 2.2166450023651123 seconds Found communities [-1, ... 18], with sizes: [152, 770, 398, 190, 179, 140, 76, 71, 69, 65, 59, 46, 44, 42, 37, 35, 27, 23, 16, 13] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4082176685333252 seconds Jaccard graph constructed in 0.42229700088500977 seconds Wrote graph to binary file in 0.15726685523986816 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902191 Louvain completed 21 runs in 1.0502548217773438 seconds PhenoGraph complete in 2.049294948577881 seconds Found communities [-1, ... 18], with sizes: [165, 758, 363, 240, 180, 130, 75, 74, 69, 55, 54, 46, 44, 41, 38, 27, 27, 25, 22, 19] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4109930992126465 seconds Jaccard graph constructed in 0.41216397285461426 seconds Wrote graph to binary file in 0.037985801696777344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900734 Louvain completed 21 runs in 1.0605406761169434 seconds PhenoGraph complete in 1.9386405944824219 seconds Found communities [-1, ... 18], with sizes: [180, 740, 298, 264, 196, 143, 90, 81, 81, 61, 53, 50, 41, 35, 33, 32, 27, 17, 16, 14] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40829038619995117 seconds Jaccard graph constructed in 0.42495107650756836 seconds Wrote graph to binary file in 0.037286996841430664 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.897438 Louvain completed 21 runs in 1.117274522781372 seconds PhenoGraph complete in 1.999619960784912 seconds Found communities [-1, ... 17], with sizes: [183, 790, 379, 180, 173, 162, 83, 76, 72, 72, 55, 46, 44, 32, 32, 24, 24, 14, 11] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41057324409484863 seconds Jaccard graph constructed in 0.4098360538482666 seconds Wrote graph to binary file in 0.1486949920654297 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.898678 After 3 runs, maximum modularity is Q = 0.899945 Louvain completed 23 runs in 1.284012794494629 seconds PhenoGraph complete in 2.2636094093322754 seconds Found communities [-1, ... 20], with sizes: [189, 762, 392, 196, 169, 132, 89, 84, 66, 63, 46, 35, 33, 30, 29, 27, 25, 23, 18, 17, 14, 13] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4084312915802002 seconds Jaccard graph constructed in 0.4162874221801758 seconds Wrote graph to binary file in 0.03622627258300781 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900493 Louvain completed 21 runs in 1.05033540725708 seconds PhenoGraph complete in 1.923551082611084 seconds Found communities [-1, ... 20], with sizes: [171, 791, 352, 196, 166, 140, 88, 72, 68, 62, 56, 42, 36, 34, 29, 28, 25, 23, 22, 18, 18, 15] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40816807746887207 seconds Jaccard graph constructed in 0.4208359718322754 seconds Wrote graph to binary file in 0.03706812858581543 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90044 After 3 runs, maximum modularity is Q = 0.901495 Louvain completed 23 runs in 1.2756478786468506 seconds PhenoGraph complete in 2.1544413566589355 seconds Found communities [-1, ... 19], with sizes: [149, 812, 405, 179, 160, 139, 93, 81, 69, 54, 46, 42, 39, 34, 33, 29, 23, 21, 17, 15, 12] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40907955169677734 seconds Jaccard graph constructed in 0.411531925201416 seconds Wrote graph to binary file in 0.15400314331054688 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90043 Louvain completed 21 runs in 1.0795376300811768 seconds PhenoGraph complete in 2.066520929336548 seconds Found communities [-1, ... 20], with sizes: [195, 779, 395, 168, 136, 132, 83, 82, 78, 69, 45, 41, 36, 35, 31, 31, 30, 24, 18, 16, 14, 14] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40826964378356934 seconds Jaccard graph constructed in 0.41058945655822754 seconds Wrote graph to binary file in 0.03816676139831543 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89756 Louvain completed 21 runs in 1.0991933345794678 seconds PhenoGraph complete in 1.9706928730010986 seconds Found communities [-1, ... 18], with sizes: [145, 802, 385, 193, 182, 135, 80, 80, 68, 68, 54, 46, 37, 35, 32, 31, 26, 23, 18, 12] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40864086151123047 seconds Jaccard graph constructed in 0.41047143936157227 seconds Wrote graph to binary file in 0.036835670471191406 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899154 Louvain completed 21 runs in 1.0300240516662598 seconds PhenoGraph complete in 1.898136854171753 seconds Found communities [-1, ... 18], with sizes: [146, 772, 404, 197, 167, 142, 94, 81, 57, 57, 49, 44, 42, 37, 36, 33, 33, 28, 22, 11] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40981054306030273 seconds Jaccard graph constructed in 0.4129366874694824 seconds Wrote graph to binary file in 0.15970945358276367 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.897327 After 2 runs, maximum modularity is Q = 0.89844 Louvain completed 22 runs in 1.269575595855713 seconds PhenoGraph complete in 2.264087438583374 seconds Found communities [-1, ... 19], with sizes: [169, 725, 391, 234, 197, 134, 78, 67, 66, 52, 52, 42, 38, 38, 35, 30, 28, 23, 23, 18, 12] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4084758758544922 seconds Jaccard graph constructed in 0.4285869598388672 seconds Wrote graph to binary file in 0.037267446517944336 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895792 Louvain completed 21 runs in 1.0494191646575928 seconds PhenoGraph complete in 1.937840223312378 seconds Found communities [-1, ... 19], with sizes: [170, 753, 372, 196, 184, 140, 99, 88, 77, 68, 38, 38, 37, 36, 26, 26, 23, 23, 23, 22, 13] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4080162048339844 seconds Jaccard graph constructed in 0.41349220275878906 seconds Wrote graph to binary file in 0.03620028495788574 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.898052 Louvain completed 21 runs in 1.0701885223388672 seconds PhenoGraph complete in 1.9381113052368164 seconds Found communities [-1, ... 20], with sizes: [182, 800, 303, 198, 155, 140, 93, 69, 69, 67, 63, 46, 37, 36, 34, 34, 33, 27, 22, 17, 15, 12] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40899109840393066 seconds Jaccard graph constructed in 0.3948996067047119 seconds Wrote graph to binary file in 0.16979265213012695 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.896895 Louvain completed 21 runs in 1.024454116821289 seconds PhenoGraph complete in 2.008869171142578 seconds Found communities [-1, ... 16], with sizes: [173, 754, 344, 222, 190, 160, 82, 74, 72, 70, 61, 57, 44, 42, 36, 32, 23, 16] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40865564346313477 seconds Jaccard graph constructed in 0.4541025161743164 seconds Wrote graph to binary file in 0.04162740707397461 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900106 Louvain completed 21 runs in 1.0664453506469727 seconds PhenoGraph complete in 1.9894070625305176 seconds Found communities [-1, ... 
20], with sizes: [142, 797, 368, 196, 185, 137, 84, 69, 68, 64, 46, 41, 37, 34, 34, 32, 26, 24, 23, 18, 16, 11]
# Post-doublet-inference normalisation of sample D322_Biop_Pro1.
# Scale every cell to a total of 10,000 counts, then log-transform.
sc.pp.normalize_per_cell(D322_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D322_Biop_Pro1) # log transform the data
# .raw is assigned after normalisation and log1p and before the gene subset,
# so it holds the normalised, log-transformed values for all genes
# (not the raw UMI counts).
D322_Biop_Pro1.raw = D322_Biop_Pro1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): presumably, as for the D326 sample further down, True marks
# genes that are NOT ribosomal, so this drops the ribosomal genes - confirm
# against the cell that fills var['ribo_genes'] for this sample.
D322_Biop_Pro1 = D322_Biop_Pro1[:, D322_Biop_Pro1.var['ribo_genes']]
# Display the resulting object.
D322_Biop_Pro1
View of AnnData object with n_obs × n_vars = 1962 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x matrix of sample D326_Biop_Pro1 from its cellranger output
# directory (outsPath is defined at the top of the notebook), using gene
# symbols as variable names and caching the parsed matrix on disk.
D326_Biop_Pro1 = sc.read_10x_mtx(
'./D326_Biop_Pro1/' + outsPath,
var_names='gene_symbols',
cache=True)
# Gene symbols may be duplicated; make the variable names unique.
D326_Biop_Pro1.var_names_make_unique()
# Sample-level metadata, repeated for every cell of this dataset.
D326_Biop_Pro1.obs['manip'] = 'D326_Biop_Pro1'
D326_Biop_Pro1.obs['position'] = 'Proximal'
D326_Biop_Pro1.obs['method'] = 'Biopsy'
D326_Biop_Pro1.obs['donor'] = 'D326'
# Prefix every cell barcode with the sample name and use the result as the
# observation names.
# NOTE(review): presumably so that cell names stay unique once the datasets
# are aggregated - confirm against the aggregation step.
D326_Biop_Pro1.obs['name'] = ['D326_Biop_Pro1_' + s for s in list(D326_Biop_Pro1.obs.index)]
D326_Biop_Pro1.obs_names = D326_Biop_Pro1.obs['name']
# Display the loaded object.
D326_Biop_Pro1
... reading from cache file ./cache/D326_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2941 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Plot the 20 most highly expressed genes of the unfiltered dataset.
sc.pl.highest_expr_genes(D326_Biop_Pro1, n_top=20)
# Per-cell QC metrics for sample D326_Biop_Pro1: number of detected genes,
# total counts, and the fraction of counts coming from mitochondrial and
# ribosomal genes.

# With min_genes=0 no cell can be removed; the call is made so that the
# per-cell 'n_genes' column used by the violin plot below is added to .obs.
sc.pp.filter_cells(D326_Biop_Pro1, min_genes=0)

# Per-cell total counts, computed once and reused as the denominator of both
# fractions below and as the 'n_counts' annotation.
total_counts = D326_Biop_Pro1.X.sum(axis=1).A1

# Mitochondrial genes are identified by the 'MT-' gene-symbol prefix.
# percent_mito is stored as a fraction in [0, 1], not as a percentage.
mito_genes = D326_Biop_Pro1.var_names.str.startswith('MT-')
D326_Biop_Pro1.obs['percent_mito'] = np.sum(
    D326_Biop_Pro1[:, mito_genes].X, axis=1).A1 / total_counts
D326_Biop_Pro1.obs['n_counts'] = total_counts

# remove_RB_genes needs a dense cells x genes DataFrame; only the list of
# ribosomal genes actually present in this dataset is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D326_Biop_Pro1.to_df())

# Boolean mask over genes (True = ribosomal), built from var_names so the
# matrix does not have to be densified a second time just to read its
# column labels.
ribo_genes = D326_Biop_Pro1.var_names.isin(RB_genes_in_df)
D326_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D326_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / total_counts

# NOTE: despite its name, var['ribo_genes'] stores the INVERTED mask:
# True marks genes that are NOT ribosomal. It is used later as a
# "genes to keep" selector when the ribosomal genes are dropped.
D326_Biop_Pro1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D326_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for sample D326_Biop_Pro1.
# Drop cells with fewer than 500 detected genes.
sc.pp.filter_cells(D326_Biop_Pro1, min_genes=500)
# Drop cells with 40,000 total counts or more.
D326_Biop_Pro1 = D326_Biop_Pro1[D326_Biop_Pro1.obs['n_counts'] < 40000, :]
# percent_mito was computed above as a ratio of counts (a fraction, not a
# percentage), so this keeps cells with less than 50% mitochondrial counts.
D326_Biop_Pro1 = D326_Biop_Pro1[D326_Biop_Pro1.obs['percent_mito'] < 0.5, :]
filtered out 12 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the filtered matrix. This runs before
# the normalisation / log-transform cell, so X has not been normalised yet.
# The expected doublet rate passed to Scrublet is 2.3%.
scrub = scr.Scrublet(D326_Biop_Pro1.X, expected_doublet_rate=0.023)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store the per-cell Scrublet score and doublet call in the cell metadata.
D326_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D326_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
# Plot the doublet-score histograms produced by Scrublet.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.27 Detected doublet rate = 0.4% Estimated detectable doublet fraction = 22.9% Overall doublet rate: Expected = 2.3% Estimated = 1.9% Elapsed time: 2.3 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eca851ba8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ebf9879b0>],
dtype=object))
# doubletDetection
# Second, independent doublet inference with DoubletDetection (default
# BoostClassifier settings) on the same filtered matrix as Scrublet.
# The sparse matrix is densified by the library (see the UserWarning below).
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D326_Biop_Pro1.X).predict()
# Store the per-cell DoubletDetection label in the cell metadata.
# NOTE(review): presumably 1 = doublet and 0 = singlet - confirm against the
# doubletdetection documentation for the installed version.
D326_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.508995771408081 seconds Jaccard graph constructed in 0.6974852085113525 seconds Wrote graph to binary file in 0.05855703353881836 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.91892 Louvain completed 21 runs in 1.257519006729126 seconds PhenoGraph complete in 2.546231269836426 seconds Found communities [-1, ... 23], with sizes: [242, 1016, 445, 243, 205, 192, 166, 152, 133, 121, 115, 84, 79, 73, 67, 65, 46, 45, 43, 26, 22, 22, 16, 15, 14] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6098105907440186 seconds Jaccard graph constructed in 0.539586067199707 seconds Wrote graph to binary file in 0.2025914192199707 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.919851 Louvain completed 21 runs in 1.2839746475219727 seconds PhenoGraph complete in 2.658162832260132 seconds Found communities [-1, ... 27], with sizes: [236, 888, 367, 260, 205, 192, 192, 169, 169, 103, 87, 83, 83, 78, 73, 61, 57, 46, 44, 40, 40, 32, 29, 28, 21, 21, 16, 14, 13] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6093878746032715 seconds Jaccard graph constructed in 0.5530076026916504 seconds Wrote graph to binary file in 0.059752702713012695 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.917163 Louvain completed 21 runs in 1.249143123626709 seconds PhenoGraph complete in 2.4859230518341064 seconds Found communities [-1, ... 24], with sizes: [233, 1118, 396, 230, 222, 175, 147, 140, 126, 122, 120, 86, 73, 68, 65, 56, 46, 43, 42, 29, 27, 23, 17, 15, 14, 14] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5099210739135742 seconds Jaccard graph constructed in 0.5796449184417725 seconds Wrote graph to binary file in 0.1806011199951172 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.917036 Louvain completed 21 runs in 1.261793613433838 seconds PhenoGraph complete in 2.54829740524292 seconds Found communities [-1, ... 23], with sizes: [233, 1133, 357, 233, 210, 188, 166, 159, 118, 116, 85, 83, 77, 71, 69, 66, 50, 44, 42, 41, 35, 23, 22, 13, 13] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5111558437347412 seconds Jaccard graph constructed in 0.6220264434814453 seconds Wrote graph to binary file in 0.06535983085632324 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.917812 After 2 runs, maximum modularity is Q = 0.918871 Louvain completed 22 runs in 1.5968191623687744 seconds PhenoGraph complete in 2.814903974533081 seconds Found communities [-1, ... 26], with sizes: [233, 1160, 378, 215, 180, 173, 165, 147, 114, 87, 84, 79, 74, 72, 66, 62, 53, 47, 46, 45, 30, 26, 25, 25, 24, 13, 12, 12] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6108925342559814 seconds Jaccard graph constructed in 0.6344013214111328 seconds Wrote graph to binary file in 0.21292638778686523 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.919472 Louvain completed 21 runs in 1.2828552722930908 seconds PhenoGraph complete in 2.7552881240844727 seconds Found communities [-1, ... 24], with sizes: [243, 1152, 424, 233, 169, 155, 147, 142, 127, 90, 86, 83, 74, 70, 63, 61, 50, 47, 41, 41, 33, 31, 27, 23, 19, 16] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.612246036529541 seconds Jaccard graph constructed in 0.6014208793640137 seconds Wrote graph to binary file in 0.0702052116394043 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.918672 Louvain completed 21 runs in 1.3591969013214111 seconds PhenoGraph complete in 2.6636412143707275 seconds Found communities [-1, ... 22], with sizes: [176, 1206, 387, 240, 195, 163, 158, 133, 120, 115, 96, 85, 81, 74, 71, 63, 48, 48, 47, 43, 43, 22, 20, 13] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5166823863983154 seconds Jaccard graph constructed in 0.6855242252349854 seconds Wrote graph to binary file in 0.062480926513671875 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.91928 Louvain completed 21 runs in 1.307063341140747 seconds PhenoGraph complete in 2.595158576965332 seconds Found communities [-1, ... 21], with sizes: [257, 1075, 446, 235, 170, 160, 159, 157, 144, 126, 121, 87, 80, 69, 64, 62, 58, 47, 44, 29, 26, 18, 13] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5155360698699951 seconds Jaccard graph constructed in 0.5446298122406006 seconds Wrote graph to binary file in 0.20676875114440918 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.91494 Louvain completed 21 runs in 1.2904012203216553 seconds PhenoGraph complete in 2.576587677001953 seconds Found communities [-1, ... 
25], with sizes: [239, 1146, 354, 230, 190, 177, 141, 135, 131, 117, 88, 78, 77, 74, 65, 63, 47, 46, 45, 43, 42, 34, 22, 19, 18, 13, 13] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6088831424713135 seconds Jaccard graph constructed in 0.5589909553527832 seconds Wrote graph to binary file in 0.06251049041748047 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.919988 After 6 runs, maximum modularity is Q = 0.921072 Louvain completed 26 runs in 1.6807591915130615 seconds PhenoGraph complete in 2.935931921005249 seconds Found communities [-1, ... 24], with sizes: [241, 975, 385, 306, 248, 194, 191, 137, 121, 85, 84, 83, 83, 70, 61, 58, 48, 47, 46, 44, 34, 31, 24, 19, 16, 16] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5100588798522949 seconds Jaccard graph constructed in 0.5263171195983887 seconds Wrote graph to binary file in 0.19574475288391113 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.917977 After 8 runs, maximum modularity is Q = 0.919186 Louvain completed 28 runs in 1.8500137329101562 seconds PhenoGraph complete in 3.10420298576355 seconds Found communities [-1, ... 22], with sizes: [236, 1136, 367, 235, 181, 176, 173, 154, 121, 118, 82, 78, 77, 71, 71, 67, 56, 51, 47, 43, 37, 32, 22, 16] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5084021091461182 seconds Jaccard graph constructed in 0.5386958122253418 seconds Wrote graph to binary file in 0.060370683670043945 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.917701 After 3 runs, maximum modularity is Q = 0.918983 Louvain completed 23 runs in 1.5238690376281738 seconds PhenoGraph complete in 2.6456220149993896 seconds Found communities [-1, ... 26], with sizes: [244, 1131, 379, 211, 198, 177, 156, 156, 129, 91, 86, 72, 72, 67, 66, 61, 47, 46, 44, 42, 32, 31, 24, 21, 17, 17, 16, 14] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5151913166046143 seconds Jaccard graph constructed in 0.560924768447876 seconds Wrote graph to binary file in 0.20106983184814453 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.919616 Louvain completed 21 runs in 1.2885427474975586 seconds PhenoGraph complete in 2.5867364406585693 seconds Found communities [-1, ... 23], with sizes: [236, 1150, 363, 212, 198, 173, 167, 167, 124, 118, 92, 89, 74, 71, 68, 59, 56, 47, 43, 42, 26, 23, 21, 16, 12] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5092031955718994 seconds Jaccard graph constructed in 0.5330150127410889 seconds Wrote graph to binary file in 0.05854654312133789 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.918719 Louvain completed 21 runs in 1.2392961978912354 seconds PhenoGraph complete in 2.3549587726593018 seconds Found communities [-1, ... 25], with sizes: [246, 1137, 321, 248, 221, 185, 160, 150, 123, 119, 84, 83, 76, 76, 62, 52, 48, 44, 41, 32, 29, 25, 21, 20, 19, 14, 11] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5080583095550537 seconds Jaccard graph constructed in 0.5296463966369629 seconds Wrote graph to binary file in 0.1806485652923584 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.919294 Louvain completed 21 runs in 1.2821345329284668 seconds PhenoGraph complete in 2.513993263244629 seconds Found communities [-1, ... 22], with sizes: [220, 1183, 362, 249, 170, 168, 158, 152, 118, 107, 105, 95, 89, 71, 64, 63, 50, 46, 46, 41, 28, 27, 18, 17] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5081584453582764 seconds Jaccard graph constructed in 0.5443904399871826 seconds Wrote graph to binary file in 0.059035539627075195 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.920404 Louvain completed 21 runs in 1.2713992595672607 seconds PhenoGraph complete in 2.4024908542633057 seconds Found communities [-1, ... 23], with sizes: [215, 1118, 481, 204, 174, 165, 161, 143, 127, 118, 112, 88, 77, 69, 68, 60, 48, 48, 47, 37, 34, 17, 13, 12, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.611882209777832 seconds Jaccard graph constructed in 0.5448915958404541 seconds Wrote graph to binary file in 0.17878127098083496 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.920401 Louvain completed 21 runs in 1.2232611179351807 seconds PhenoGraph complete in 2.578368663787842 seconds Found communities [-1, ... 25], with sizes: [264, 1095, 350, 238, 201, 190, 176, 166, 124, 99, 95, 84, 81, 68, 59, 46, 46, 44, 42, 40, 32, 22, 20, 20, 18, 14, 13] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5093872547149658 seconds Jaccard graph constructed in 0.5328578948974609 seconds Wrote graph to binary file in 0.1910557746887207 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.918529 Louvain completed 21 runs in 1.3119118213653564 seconds PhenoGraph complete in 2.56008243560791 seconds Found communities [-1, ... 27], with sizes: [233, 1125, 392, 220, 188, 177, 174, 159, 97, 91, 88, 88, 78, 63, 59, 57, 55, 47, 44, 37, 29, 28, 24, 22, 20, 15, 13, 13, 11] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.915266752243042 seconds Jaccard graph constructed in 0.5336551666259766 seconds Wrote graph to binary file in 0.05922102928161621 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.921915 Louvain completed 21 runs in 1.2477316856384277 seconds PhenoGraph complete in 2.769258499145508 seconds Found communities [-1, ... 26], with sizes: [266, 840, 352, 276, 222, 195, 190, 154, 140, 121, 117, 89, 84, 81, 74, 73, 57, 47, 44, 44, 32, 31, 29, 28, 21, 16, 13, 11] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5139877796173096 seconds Jaccard graph constructed in 0.5335690975189209 seconds Wrote graph to binary file in 0.19356060028076172 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.917977 Louvain completed 21 runs in 1.2736947536468506 seconds PhenoGraph complete in 2.529722213745117 seconds Found communities [-1, ... 23], with sizes: [222, 1130, 403, 239, 201, 184, 166, 133, 118, 111, 94, 79, 75, 74, 65, 60, 47, 46, 45, 43, 33, 27, 21, 17, 14] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6103405952453613 seconds Jaccard graph constructed in 0.534160852432251 seconds Wrote graph to binary file in 0.05935263633728027 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.918903 After 14 runs, maximum modularity is Q = 0.919967 Louvain completed 34 runs in 2.1069209575653076 seconds PhenoGraph complete in 3.3257455825805664 seconds Found communities [-1, ... 25], with sizes: [240, 1117, 412, 208, 195, 186, 183, 165, 114, 111, 86, 77, 72, 62, 60, 52, 48, 47, 39, 33, 27, 26, 24, 24, 14, 14, 11] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9112300872802734 seconds Jaccard graph constructed in 0.5613467693328857 seconds Wrote graph to binary file in 0.20331883430480957 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.918831 Louvain completed 21 runs in 1.2733023166656494 seconds PhenoGraph complete in 2.965576410293579 seconds Found communities [-1, ... 24], with sizes: [247, 1104, 347, 239, 219, 201, 159, 144, 132, 86, 85, 83, 80, 80, 67, 60, 51, 46, 45, 40, 28, 27, 22, 21, 19, 15] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5155503749847412 seconds Jaccard graph constructed in 0.5484151840209961 seconds Wrote graph to binary file in 0.06045269966125488 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.919968 Louvain completed 21 runs in 1.283388376235962 seconds PhenoGraph complete in 2.423159599304199 seconds Found communities [-1, ... 24], with sizes: [253, 1142, 383, 233, 185, 172, 161, 148, 130, 109, 85, 80, 78, 69, 67, 57, 46, 42, 38, 34, 30, 30, 26, 23, 15, 11] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.611957311630249 seconds Jaccard graph constructed in 0.554969310760498 seconds Wrote graph to binary file in 0.20175743103027344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.920297 After 2 runs, maximum modularity is Q = 0.921505 Louvain completed 22 runs in 1.5158917903900146 seconds PhenoGraph complete in 2.899604082107544 seconds Found communities [-1, ... 25], with sizes: [239, 1132, 374, 228, 192, 159, 152, 152, 141, 100, 87, 73, 72, 64, 62, 60, 60, 47, 46, 46, 34, 29, 27, 21, 20, 16, 14] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6114456653594971 seconds Jaccard graph constructed in 0.5474045276641846 seconds Wrote graph to binary file in 0.05917859077453613 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.918802 Louvain completed 21 runs in 1.2539029121398926 seconds PhenoGraph complete in 2.4851999282836914 seconds Found communities [-1, ... 23], with sizes: [253, 1092, 367, 228, 199, 193, 165, 161, 120, 120, 109, 91, 79, 76, 73, 53, 47, 46, 42, 37, 29, 21, 18, 17, 11]
# Post-doublet-inference normalization of D326_Biop_Pro1.
# The order of the steps matters: scale -> log-transform -> snapshot into .raw -> subset genes.
sc.pp.normalize_per_cell(D326_Biop_Pro1, counts_per_cell_after=1e4) # scale every cell to a total of 10,000 counts
sc.pp.log1p(D326_Biop_Pro1) # log(1 + x) transform of the scaled values
D326_Biop_Pro1.raw = D326_Biop_Pro1 # freeze the normalized, log-transformed, full-gene state in .raw (for later use)
# Keep only the genes flagged True in var['ribo_genes']; the result is a view (see the printed summary).
# NOTE(review): despite its name, this flag presumably marks the NON-ribosomal genes, as it
# does for the other samples in this notebook -- confirm against the QC cell of this sample.
D326_Biop_Pro1 = D326_Biop_Pro1[:, D326_Biop_Pro1.var['ribo_genes']]
D326_Biop_Pro1
View of AnnData object with n_obs × n_vars = 2918 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x filtered count matrix of sample D339_Biop_Pro1, indexed by gene
# symbol. cache=True makes later runs read the cached copy of the matrix.
D339_Biop_Pro1 = sc.read_10x_mtx(
    './D339_Biop_Pro1/' + outsPath, var_names='gene_symbols', cache=True
)
D339_Biop_Pro1.var_names_make_unique()

# Sample-level annotations, broadcast to every cell of this dataset.
D339_Biop_Pro1.obs['manip'] = 'D339_Biop_Pro1'
D339_Biop_Pro1.obs['position'] = 'Proximal'
D339_Biop_Pro1.obs['method'] = 'Biopsy'
D339_Biop_Pro1.obs['donor'] = 'D339'

# Prefix each barcode with the sample name and use the result as the cell
# identifier.
D339_Biop_Pro1.obs['name'] = [
    'D339_Biop_Pro1_' + barcode for barcode in D339_Biop_Pro1.obs_names
]
D339_Biop_Pro1.obs_names = D339_Biop_Pro1.obs['name']
D339_Biop_Pro1
... reading from cache file ./cache/D339_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 762 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell quality-control metrics for D339_Biop_Pro1 (computed on raw counts).
sc.pl.highest_expr_genes(D339_Biop_Pro1, n_top=20)

# min_genes=0 is used here so that obs['n_genes'] is available for the violin
# plot below; the real gene-count threshold is applied in the filtering cell.
sc.pp.filter_cells(D339_Biop_Pro1, min_genes=0)

# Mitochondrial content: counts of 'MT-' genes over total counts per cell.
# NOTE: despite the name, 'percent_mito' is a fraction in [0, 1], not a percentage.
mito_genes = D339_Biop_Pro1.var_names.str.startswith('MT-')
D339_Biop_Pro1.obs['percent_mito'] = (
    D339_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1
    / D339_Biop_Pro1.X.sum(axis=1).A1
)
D339_Biop_Pro1.obs['n_counts'] = D339_Biop_Pro1.X.sum(axis=1).A1

# remove_RB_genes needs a dense DataFrame; the names it returns are kept as
# notebook-level variables exactly as before.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D339_Biop_Pro1.to_df()
)
# Boolean mask over genes (True = gene is in the RB gene list). The gene names
# are read from var_names directly, so the matrix is not densified a second
# time just to obtain its column labels.
ribo_genes = D339_Biop_Pro1.var_names.isin(RB_genes_in_df)
# Ribosomal content, also stored as a fraction in [0, 1].
D339_Biop_Pro1.obs['percent_ribo'] = (
    D339_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1
    / D339_Biop_Pro1.X.sum(axis=1).A1
)
# CAUTION: var['ribo_genes'] holds the NEGATED mask, i.e. it is True for the
# genes that are NOT in the RB list. It is used later to keep non-RB genes.
D339_Biop_Pro1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D339_Biop_Pro1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D339_Biop_Pro1.
# 1) Drop, in place, the cells with fewer than 500 detected genes.
sc.pp.filter_cells(D339_Biop_Pro1, min_genes=500)
# 2) Keep the cells with fewer than 40000 total counts AND a mitochondrial
#    fraction below 0.2. Both criteria are applied in one subsetting step and
#    the result is materialized with .copy(), so the following cells write
#    their .obs columns to a real AnnData object rather than to a view of a view.
D339_Biop_Pro1 = D339_Biop_Pro1[
    (D339_Biop_Pro1.obs['n_counts'] < 40000)
    & (D339_Biop_Pro1.obs['percent_mito'] < 0.2),
    :,
].copy()
filtered out 8 cells that have less than 500 genes expressed
# scrublet
# Doublet inference on the filtered cells of D339_Biop_Pro1. X still holds the
# un-normalized counts here (normalization happens in a later cell).
# expected_doublet_rate is the doublet fraction assumed for this sample.
scrub = scr.Scrublet(D339_Biop_Pro1.X, expected_doublet_rate=0.006)
# Returns a doublet score and a doublet call for the cells; the score threshold
# is set automatically (its value is reported in the log output).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata.
D339_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D339_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
# Plot the score histograms to visually check the automatic threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.05 Detected doublet rate = 0.1% Estimated detectable doublet fraction = 36.6% Overall doublet rate: Expected = 0.6% Estimated = 0.4% Elapsed time: 0.4 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ebf9c2898>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb98aba58>],
dtype=object))
# doubletDetection
# Second doublet inference on the same filtered cells, using BoostClassifier
# with its default settings. X is sparse and is densified by the library
# (see the warning in the output).
clf = doubletdetection.BoostClassifier()
# fit() runs the iterations logged below; predict() produces the labels.
labels = clf.fit(D339_Biop_Pro1.X).predict()
# Stored in the per-cell metadata next to the scrublet results.
D339_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1105043888092041 seconds Jaccard graph constructed in 0.3054227828979492 seconds Wrote graph to binary file in 0.021872520446777344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.871843 Louvain completed 21 runs in 0.8053634166717529 seconds PhenoGraph complete in 1.2517876625061035 seconds Found communities [-1, ... 13], with sizes: [108, 112, 98, 90, 77, 65, 65, 65, 54, 51, 47, 33, 31, 28, 12] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11123776435852051 seconds Jaccard graph constructed in 0.30702853202819824 seconds Wrote graph to binary file in 0.02108168601989746 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.864588 Louvain completed 21 runs in 0.8700978755950928 seconds PhenoGraph complete in 1.3188762664794922 seconds Found communities [-1, ... 12], with sizes: [143, 109, 108, 87, 76, 62, 58, 57, 51, 50, 50, 37, 29, 19] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10672926902770996 seconds Jaccard graph constructed in 0.30466508865356445 seconds Wrote graph to binary file in 0.022786855697631836 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876061 Louvain completed 21 runs in 0.984729528427124 seconds PhenoGraph complete in 1.4289076328277588 seconds Found communities [-1, ... 14], with sizes: [123, 111, 101, 96, 68, 67, 61, 58, 54, 50, 45, 28, 21, 20, 17, 16] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11058402061462402 seconds Jaccard graph constructed in 0.2976970672607422 seconds Wrote graph to binary file in 0.018837690353393555 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.870992 Louvain completed 21 runs in 0.9724931716918945 seconds PhenoGraph complete in 1.4083311557769775 seconds Found communities [-1, ... 13], with sizes: [101, 114, 99, 76, 75, 72, 65, 61, 55, 53, 52, 34, 33, 25, 21] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10963940620422363 seconds Jaccard graph constructed in 0.3056976795196533 seconds Wrote graph to binary file in 0.021270751953125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.870528 Louvain completed 21 runs in 0.8304357528686523 seconds PhenoGraph complete in 1.2748208045959473 seconds Found communities [-1, ... 
13], with sizes: [125, 110, 103, 87, 83, 71, 58, 56, 49, 48, 45, 31, 28, 27, 15] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11102032661437988 seconds Jaccard graph constructed in 0.28727102279663086 seconds Wrote graph to binary file in 0.020878076553344727 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.8745 Louvain completed 21 runs in 0.9878203868865967 seconds PhenoGraph complete in 1.4170212745666504 seconds Found communities [-1, ... 14], with sizes: [111, 116, 97, 84, 76, 75, 68, 57, 55, 51, 40, 35, 20, 20, 18, 13] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10653018951416016 seconds Jaccard graph constructed in 0.3100135326385498 seconds Wrote graph to binary file in 0.019186973571777344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.873832 Louvain completed 21 runs in 0.839667558670044 seconds PhenoGraph complete in 1.2862062454223633 seconds Found communities [-1, ... 13], with sizes: [134, 106, 95, 93, 84, 67, 66, 50, 49, 44, 43, 30, 27, 25, 23] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11025357246398926 seconds Jaccard graph constructed in 0.29729771614074707 seconds Wrote graph to binary file in 0.19408893585205078 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.866812 Louvain completed 21 runs in 0.8526337146759033 seconds PhenoGraph complete in 1.4625613689422607 seconds Found communities [-1, ... 13], with sizes: [135, 127, 114, 92, 84, 70, 53, 47, 44, 41, 39, 29, 25, 25, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10551738739013672 seconds Jaccard graph constructed in 0.30333685874938965 seconds Wrote graph to binary file in 0.020332813262939453 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.874799 Louvain completed 21 runs in 0.8151609897613525 seconds PhenoGraph complete in 1.252237319946289 seconds Found communities [-1, ... 14], with sizes: [102, 112, 100, 75, 74, 73, 72, 63, 59, 47, 45, 29, 27, 24, 19, 15] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11081314086914062 seconds Jaccard graph constructed in 0.2903783321380615 seconds Wrote graph to binary file in 0.017891645431518555 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.870067 Louvain completed 21 runs in 0.8367900848388672 seconds PhenoGraph complete in 1.2644429206848145 seconds Found communities [-1, ... 
12], with sizes: [135, 100, 93, 92, 81, 71, 70, 61, 60, 49, 46, 28, 26, 24] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10536527633666992 seconds Jaccard graph constructed in 0.28545403480529785 seconds Wrote graph to binary file in 0.020786762237548828 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.873652 Louvain completed 21 runs in 0.8139078617095947 seconds PhenoGraph complete in 1.2330331802368164 seconds Found communities [-1, ... 13], with sizes: [116, 122, 95, 83, 80, 76, 72, 56, 50, 48, 44, 34, 22, 21, 17] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11239171028137207 seconds Jaccard graph constructed in 0.3006908893585205 seconds Wrote graph to binary file in 0.02051854133605957 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.871184 After 2 runs, maximum modularity is Q = 0.872313 After 3 runs, maximum modularity is Q = 0.874645 Louvain completed 23 runs in 1.3586170673370361 seconds PhenoGraph complete in 1.8027243614196777 seconds Found communities [-1, ... 13], with sizes: [114, 115, 93, 84, 77, 75, 68, 65, 64, 49, 43, 26, 26, 19, 18] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1068112850189209 seconds Jaccard graph constructed in 0.29584717750549316 seconds Wrote graph to binary file in 0.020511865615844727 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87659 Louvain completed 21 runs in 0.8569049835205078 seconds PhenoGraph complete in 1.2886419296264648 seconds Found communities [-1, ... 14], with sizes: [99, 129, 94, 85, 78, 61, 59, 56, 52, 51, 49, 36, 31, 24, 19, 13] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10520482063293457 seconds Jaccard graph constructed in 0.29891347885131836 seconds Wrote graph to binary file in 0.01911640167236328 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.873772 Louvain completed 21 runs in 0.8438427448272705 seconds PhenoGraph complete in 1.274444341659546 seconds Found communities [-1, ... 13], with sizes: [102, 113, 92, 88, 79, 69, 62, 56, 52, 48, 47, 40, 34, 30, 24] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10613346099853516 seconds Jaccard graph constructed in 0.28910326957702637 seconds Wrote graph to binary file in 0.02112102508544922 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869099 After 7 runs, maximum modularity is Q = 0.87019 Louvain completed 27 runs in 1.1831326484680176 seconds PhenoGraph complete in 1.6093153953552246 seconds Found communities [-1, ... 
13], with sizes: [100, 126, 89, 88, 75, 74, 71, 53, 49, 48, 48, 33, 30, 29, 23] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10950160026550293 seconds Jaccard graph constructed in 0.2922396659851074 seconds Wrote graph to binary file in 0.021924734115600586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.871361 After 2 runs, maximum modularity is Q = 0.873418 After 6 runs, maximum modularity is Q = 0.874556 Louvain completed 26 runs in 1.2192704677581787 seconds PhenoGraph complete in 1.6515581607818604 seconds Found communities [-1, ... 13], with sizes: [115, 115, 108, 85, 77, 73, 61, 53, 51, 50, 48, 30, 25, 23, 22] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1106576919555664 seconds Jaccard graph constructed in 0.29552173614501953 seconds Wrote graph to binary file in 0.1611185073852539 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.865542 After 2 runs, maximum modularity is Q = 0.869601 After 3 runs, maximum modularity is Q = 0.870671 After 4 runs, maximum modularity is Q = 0.872093 Louvain completed 24 runs in 1.3465197086334229 seconds PhenoGraph complete in 1.9232587814331055 seconds Found communities [-1, ... 12], with sizes: [128, 116, 100, 89, 76, 70, 63, 59, 57, 51, 46, 30, 26, 25] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10527610778808594 seconds Jaccard graph constructed in 0.30194950103759766 seconds Wrote graph to binary file in 0.019508838653564453 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.865154 After 2 runs, maximum modularity is Q = 0.866234 Louvain completed 22 runs in 1.0532629489898682 seconds PhenoGraph complete in 1.490898609161377 seconds Found communities [-1, ... 13], with sizes: [115, 119, 98, 82, 81, 71, 61, 54, 51, 49, 47, 33, 31, 26, 18] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11021256446838379 seconds Jaccard graph constructed in 0.2950124740600586 seconds Wrote graph to binary file in 0.019244670867919922 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869625 After 14 runs, maximum modularity is Q = 0.870629 Louvain completed 34 runs in 1.3480546474456787 seconds PhenoGraph complete in 1.7820680141448975 seconds Found communities [-1, ... 13], with sizes: [101, 124, 109, 91, 86, 70, 69, 50, 48, 48, 45, 36, 26, 20, 13] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10982561111450195 seconds Jaccard graph constructed in 0.3057708740234375 seconds Wrote graph to binary file in 0.01995229721069336 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.870191 Louvain completed 21 runs in 0.7929477691650391 seconds PhenoGraph complete in 1.2375876903533936 seconds Found communities [-1, ... 14], with sizes: [134, 115, 85, 82, 70, 67, 66, 47, 46, 45, 44, 42, 39, 22, 21, 11] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10883021354675293 seconds Jaccard graph constructed in 0.2888777256011963 seconds Wrote graph to binary file in 0.019455909729003906 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.874177 Louvain completed 21 runs in 0.8041174411773682 seconds PhenoGraph complete in 1.2286975383758545 seconds Found communities [-1, ... 12], with sizes: [106, 113, 101, 94, 78, 68, 67, 53, 52, 46, 46, 39, 38, 35] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10550808906555176 seconds Jaccard graph constructed in 0.29207921028137207 seconds Wrote graph to binary file in 0.02077507972717285 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868459 After 17 runs, maximum modularity is Q = 0.869754 Louvain completed 37 runs in 1.6946866512298584 seconds PhenoGraph complete in 2.1203715801239014 seconds Found communities [-1, ... 
13], with sizes: [118, 115, 99, 80, 77, 72, 64, 60, 50, 49, 43, 36, 27, 24, 22] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10578250885009766 seconds Jaccard graph constructed in 0.3411259651184082 seconds Wrote graph to binary file in 0.030796527862548828 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872573 Louvain completed 21 runs in 0.8219902515411377 seconds PhenoGraph complete in 1.3151566982269287 seconds Found communities [-1, ... 12], with sizes: [134, 110, 94, 93, 90, 69, 61, 57, 56, 48, 47, 28, 27, 22] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1129140853881836 seconds Jaccard graph constructed in 0.2823147773742676 seconds Wrote graph to binary file in 0.14710569381713867 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.873892 Louvain completed 21 runs in 0.825577974319458 seconds PhenoGraph complete in 1.3753232955932617 seconds Found communities [-1, ... 12], with sizes: [130, 120, 93, 90, 81, 80, 60, 52, 52, 49, 48, 38, 22, 21] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10958337783813477 seconds Jaccard graph constructed in 0.3104887008666992 seconds Wrote graph to binary file in 0.020822525024414062 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.86721 After 3 runs, maximum modularity is Q = 0.868278 Louvain completed 23 runs in 1.0420918464660645 seconds PhenoGraph complete in 1.4943444728851318 seconds Found communities [-1, ... 14], with sizes: [114, 118, 111, 86, 64, 60, 57, 52, 50, 46, 43, 42, 26, 26, 24, 17]
# Post-doublet-inference normalization of D339_Biop_Pro1.
# The order of the steps matters: scale -> log-transform -> snapshot into .raw -> subset genes.
sc.pp.normalize_per_cell(D339_Biop_Pro1, counts_per_cell_after=1e4) # scale every cell to a total of 10,000 counts
sc.pp.log1p(D339_Biop_Pro1) # log(1 + x) transform of the scaled values
D339_Biop_Pro1.raw = D339_Biop_Pro1 # freeze the normalized, log-transformed, full-gene state in .raw (for later use)
# var['ribo_genes'] was stored as the NEGATION of the RB-gene mask, so this keeps the genes
# that are NOT in the RB gene list. The result is a view (see the printed summary).
D339_Biop_Pro1 = D339_Biop_Pro1[:, D339_Biop_Pro1.var['ribo_genes']]
D339_Biop_Pro1
View of AnnData object with n_obs × n_vars = 749 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x filtered count matrix of sample D344_Biop_Pro1, indexed by gene
# symbol. cache=True makes later runs read the cached copy of the matrix.
D344_Biop_Pro1 = sc.read_10x_mtx(
    './D344_Biop_Pro1/' + outsPath, var_names='gene_symbols', cache=True
)
D344_Biop_Pro1.var_names_make_unique()

# Sample-level annotations, broadcast to every cell of this dataset.
D344_Biop_Pro1.obs['manip'] = 'D344_Biop_Pro1'
D344_Biop_Pro1.obs['position'] = 'Proximal'
D344_Biop_Pro1.obs['method'] = 'Biopsy'
D344_Biop_Pro1.obs['donor'] = 'D344'

# Prefix each barcode with the sample name and use the result as the cell
# identifier.
D344_Biop_Pro1.obs['name'] = [
    'D344_Biop_Pro1_' + barcode for barcode in D344_Biop_Pro1.obs_names
]
D344_Biop_Pro1.obs_names = D344_Biop_Pro1.obs['name']
D344_Biop_Pro1
... reading from cache file ./cache/D344_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 313 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell quality-control metrics for D344_Biop_Pro1 (computed on raw counts).
sc.pl.highest_expr_genes(D344_Biop_Pro1, n_top=20)

# min_genes=0 is used here so that obs['n_genes'] is available for the violin
# plot below; the real gene-count threshold is applied in the filtering cell.
sc.pp.filter_cells(D344_Biop_Pro1, min_genes=0)

# Mitochondrial content: counts of 'MT-' genes over total counts per cell.
# NOTE: despite the name, 'percent_mito' is a fraction in [0, 1], not a percentage.
mito_genes = D344_Biop_Pro1.var_names.str.startswith('MT-')
D344_Biop_Pro1.obs['percent_mito'] = (
    D344_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1
    / D344_Biop_Pro1.X.sum(axis=1).A1
)
D344_Biop_Pro1.obs['n_counts'] = D344_Biop_Pro1.X.sum(axis=1).A1

# remove_RB_genes needs a dense DataFrame; the names it returns are kept as
# notebook-level variables exactly as before.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D344_Biop_Pro1.to_df()
)
# Boolean mask over genes (True = gene is in the RB gene list). The gene names
# are read from var_names directly, so the matrix is not densified a second
# time just to obtain its column labels.
ribo_genes = D344_Biop_Pro1.var_names.isin(RB_genes_in_df)
# Ribosomal content, also stored as a fraction in [0, 1].
D344_Biop_Pro1.obs['percent_ribo'] = (
    D344_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1
    / D344_Biop_Pro1.X.sum(axis=1).A1
)
# CAUTION: var['ribo_genes'] holds the NEGATED mask, i.e. it is True for the
# genes that are NOT in the RB list. It is used later to keep non-RB genes.
D344_Biop_Pro1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D344_Biop_Pro1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D344_Biop_Pro1.
# 1) Drop, in place, the cells with fewer than 500 detected genes.
sc.pp.filter_cells(D344_Biop_Pro1, min_genes=500)
# 2) Keep the cells with fewer than 40000 total counts AND a mitochondrial
#    fraction below 0.15. Both criteria are applied in one subsetting step and
#    the result is materialized with .copy(), so the following cells write
#    their .obs columns to a real AnnData object rather than to a view of a view.
D344_Biop_Pro1 = D344_Biop_Pro1[
    (D344_Biop_Pro1.obs['n_counts'] < 40000)
    & (D344_Biop_Pro1.obs['percent_mito'] < 0.15),
    :,
].copy()
filtered out 7 cells that have less than 500 genes expressed
# scrublet
# Doublet inference on the filtered cells of D344_Biop_Pro1. X still holds the
# un-normalized counts here (normalization happens in a later cell).
# expected_doublet_rate is the doublet fraction assumed for this sample.
scrub = scr.Scrublet(D344_Biop_Pro1.X, expected_doublet_rate=0.004)
# Returns a doublet score and a doublet call for the cells; the score threshold
# is set automatically (its value is reported in the log output).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata.
D344_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D344_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
# Plot the score histograms to visually check the automatic threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.02 Detected doublet rate = 0.7% Estimated detectable doublet fraction = 26.2% Overall doublet rate: Expected = 0.4% Estimated = 2.5% Elapsed time: 0.2 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb9d3ba58>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ebfd1e860>],
dtype=object))
# doubletDetection
# Second doublet inference on the same filtered cells, using BoostClassifier
# with its default settings. X is sparse and is densified by the library
# (see the warning in the output).
clf = doubletdetection.BoostClassifier()
# fit() runs the iterations logged below; predict() produces the labels.
labels = clf.fit(D344_Biop_Pro1.X).predict()
# Stored in the per-cell metadata next to the scrublet results.
D344_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10980510711669922 seconds Jaccard graph constructed in 0.2169487476348877 seconds Wrote graph to binary file in 0.007748126983642578 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.809501 Louvain completed 21 runs in 0.8823840618133545 seconds PhenoGraph complete in 1.2258474826812744 seconds Found communities [-1, ... 6], with sizes: [100, 57, 49, 47, 43, 40, 26, 19] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11079072952270508 seconds Jaccard graph constructed in 0.22571182250976562 seconds Wrote graph to binary file in 0.008206605911254883 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.807092 Louvain completed 21 runs in 0.7661454677581787 seconds PhenoGraph complete in 1.1178562641143799 seconds Found communities [-1, ... 7], with sizes: [80, 60, 54, 51, 42, 41, 26, 15, 12] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10866880416870117 seconds Jaccard graph constructed in 0.2189793586730957 seconds Wrote graph to binary file in 0.012250185012817383 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.804122 Louvain completed 21 runs in 0.7788655757904053 seconds PhenoGraph complete in 1.1266555786132812 seconds Found communities [-1, ... 
6], with sizes: [77, 70, 56, 50, 48, 35, 34, 11] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10855412483215332 seconds Jaccard graph constructed in 0.20752477645874023 seconds Wrote graph to binary file in 0.011024951934814453 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.806949 Louvain completed 21 runs in 0.7930207252502441 seconds PhenoGraph complete in 1.1296765804290771 seconds Found communities [-1, ... 6], with sizes: [107, 53, 47, 44, 42, 41, 30, 17] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10944557189941406 seconds Jaccard graph constructed in 0.2146296501159668 seconds Wrote graph to binary file in 0.010806798934936523 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.801462 Louvain completed 21 runs in 0.7459042072296143 seconds PhenoGraph complete in 1.090465784072876 seconds Found communities [-1, ... 6], with sizes: [86, 53, 52, 48, 48, 47, 29, 18] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11103200912475586 seconds Jaccard graph constructed in 0.21146082878112793 seconds Wrote graph to binary file in 0.009027242660522461 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.801157 Louvain completed 21 runs in 0.7538511753082275 seconds PhenoGraph complete in 1.0926225185394287 seconds Found communities [-1, ... 8], with sizes: [85, 56, 46, 45, 41, 36, 25, 23, 12, 12] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10889720916748047 seconds Jaccard graph constructed in 0.20839786529541016 seconds Wrote graph to binary file in 0.012212038040161133 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.794684 Louvain completed 21 runs in 0.8640484809875488 seconds PhenoGraph complete in 1.2027547359466553 seconds Found communities [-1, ... 8], with sizes: [76, 49, 47, 45, 38, 32, 31, 26, 26, 11] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1086585521697998 seconds Jaccard graph constructed in 0.2087383270263672 seconds Wrote graph to binary file in 0.013257026672363281 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.802179 Louvain completed 21 runs in 0.7582082748413086 seconds PhenoGraph complete in 1.0966300964355469 seconds Found communities [-1, ... 6], with sizes: [87, 62, 55, 47, 44, 42, 33, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... 
Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11080551147460938 seconds Jaccard graph constructed in 0.21363544464111328 seconds Wrote graph to binary file in 0.008567571640014648 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.804379 Louvain completed 21 runs in 0.7859487533569336 seconds PhenoGraph complete in 1.1277024745941162 seconds Found communities [-1, ... 8], with sizes: [75, 48, 46, 41, 36, 34, 32, 31, 27, 11] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1095590591430664 seconds Jaccard graph constructed in 0.22375011444091797 seconds Wrote graph to binary file in 0.008483171463012695 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.814029 Louvain completed 21 runs in 0.8890140056610107 seconds PhenoGraph complete in 1.2399089336395264 seconds Found communities [-1, ... 6], with sizes: [106, 55, 45, 43, 41, 40, 27, 24] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10959768295288086 seconds Jaccard graph constructed in 0.22372150421142578 seconds Wrote graph to binary file in 0.011902332305908203 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.807101 Louvain completed 21 runs in 0.807478666305542 seconds PhenoGraph complete in 1.164170503616333 seconds Found communities [-1, ... 
5], with sizes: [101, 69, 54, 47, 44, 33, 33] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1050570011138916 seconds Jaccard graph constructed in 0.23449349403381348 seconds Wrote graph to binary file in 0.00814366340637207 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.788124 Louvain completed 21 runs in 0.8163847923278809 seconds PhenoGraph complete in 1.1744191646575928 seconds Found communities [-1, ... 6], with sizes: [92, 65, 52, 44, 42, 32, 31, 23] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10597991943359375 seconds Jaccard graph constructed in 0.22053265571594238 seconds Wrote graph to binary file in 0.008925199508666992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.794395 Louvain completed 21 runs in 0.7983870506286621 seconds PhenoGraph complete in 1.1408591270446777 seconds Found communities [-1, ... 6], with sizes: [76, 62, 59, 53, 44, 41, 25, 21] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10756349563598633 seconds Jaccard graph constructed in 0.21567511558532715 seconds Wrote graph to binary file in 0.011394977569580078 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.80815 Louvain completed 21 runs in 0.9231748580932617 seconds PhenoGraph complete in 1.266977310180664 seconds Found communities [-1, ... 7], with sizes: [69, 60, 57, 50, 47, 42, 31, 13, 12] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11142158508300781 seconds Jaccard graph constructed in 0.2341620922088623 seconds Wrote graph to binary file in 0.00862741470336914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.815264 Louvain completed 21 runs in 0.8969509601593018 seconds PhenoGraph complete in 1.258319616317749 seconds Found communities [-1, ... 7], with sizes: [98, 47, 44, 44, 41, 36, 27, 24, 20] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10501313209533691 seconds Jaccard graph constructed in 0.2125685214996338 seconds Wrote graph to binary file in 0.19133543968200684 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.797398 Louvain completed 21 runs in 0.853858470916748 seconds PhenoGraph complete in 1.3762588500976562 seconds Found communities [-1, ... 5], with sizes: [79, 72, 55, 54, 50, 45, 26] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... 
Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11233663558959961 seconds Jaccard graph constructed in 0.2313520908355713 seconds Wrote graph to binary file in 0.013152837753295898 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.805367 Louvain completed 21 runs in 0.7601668834686279 seconds PhenoGraph complete in 1.12748122215271 seconds Found communities [-1, ... 7], with sizes: [78, 63, 52, 44, 44, 39, 25, 23, 13] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10541272163391113 seconds Jaccard graph constructed in 0.24375271797180176 seconds Wrote graph to binary file in 0.012204170227050781 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.79969 Louvain completed 21 runs in 0.7544903755187988 seconds PhenoGraph complete in 1.1274042129516602 seconds Found communities [-1, ... 6], with sizes: [99, 64, 45, 44, 41, 35, 29, 24] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11178898811340332 seconds Jaccard graph constructed in 0.2444000244140625 seconds Wrote graph to binary file in 0.015204668045043945 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.804987 Louvain completed 21 runs in 0.8856735229492188 seconds PhenoGraph complete in 1.2687458992004395 seconds Found communities [-1, ... 
7], with sizes: [93, 55, 45, 43, 42, 41, 30, 21, 11] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1050407886505127 seconds Jaccard graph constructed in 0.2237563133239746 seconds Wrote graph to binary file in 0.0132598876953125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.811552 Louvain completed 21 runs in 0.8793680667877197 seconds PhenoGraph complete in 1.2291104793548584 seconds Found communities [-1, ... 6], with sizes: [88, 62, 59, 55, 41, 38, 27, 11] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10558271408081055 seconds Jaccard graph constructed in 0.22791576385498047 seconds Wrote graph to binary file in 0.012525081634521484 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.797517 Louvain completed 21 runs in 0.7461190223693848 seconds PhenoGraph complete in 1.116572618484497 seconds Found communities [-1, ... 6], with sizes: [88, 55, 53, 50, 43, 35, 31, 26] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10491681098937988 seconds Jaccard graph constructed in 0.2271714210510254 seconds Wrote graph to binary file in 0.013199329376220703 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.801786 Louvain completed 21 runs in 0.8835361003875732 seconds PhenoGraph complete in 1.236802101135254 seconds Found communities [-1, ... 5], with sizes: [87, 57, 54, 53, 50, 48, 32] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10529279708862305 seconds Jaccard graph constructed in 0.21335053443908691 seconds Wrote graph to binary file in 0.013172149658203125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.815623 Louvain completed 21 runs in 0.7665512561798096 seconds PhenoGraph complete in 1.106257677078247 seconds Found communities [-1, ... 5], with sizes: [105, 59, 57, 47, 45, 42, 26] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10492658615112305 seconds Jaccard graph constructed in 0.23510122299194336 seconds Wrote graph to binary file in 0.01239776611328125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.800669 Louvain completed 21 runs in 0.7814671993255615 seconds PhenoGraph complete in 1.1406383514404297 seconds Found communities [-1, ... 6], with sizes: [89, 63, 59, 46, 43, 40, 30, 11] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... 
Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10491538047790527 seconds Jaccard graph constructed in 0.23580455780029297 seconds Wrote graph to binary file in 0.011727333068847656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.806018 Louvain completed 21 runs in 0.7661678791046143 seconds PhenoGraph complete in 1.1444058418273926 seconds Found communities [-1, ... 5], with sizes: [94, 59, 58, 48, 46, 43, 33]
# Normalization step for sample D344_Biop_Pro1.
sc.pp.normalize_per_cell(D344_Biop_Pro1, counts_per_cell_after=1e4) # scale every cell to a total of 10,000 counts
sc.pp.log1p(D344_Biop_Pro1) # log(1 + x) transform of the normalized counts
# Freeze the current state in .raw for later use. Note that at this point the
# object holds normalized, log-transformed values (not the original counts).
D344_Biop_Pro1.raw = D344_Biop_Pro1
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): for the other samples this column stores the *negated*
# ribosomal mask (True = gene is NOT ribosomal); presumably the same holds
# here, so this line drops the ribosomal genes - confirm against this
# sample's QC cell.
D344_Biop_Pro1 = D344_Biop_Pro1[:, D344_Biop_Pro1.var['ribo_genes']]
D344_Biop_Pro1 # display the resulting (view) object
View of AnnData object with n_obs × n_vars = 305 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the Cell Ranger filtered gene/barcode matrix of sample D353_Biop_Pro1.
D353_Biop_Pro1 = sc.read_10x_mtx(
    './D353_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',  # index genes by symbol
    cache=True)  # reuse the cached .h5ad file on later runs
D353_Biop_Pro1.var_names_make_unique()

# Sample-level annotations, repeated on every cell.
D353_Biop_Pro1.obs['manip'] = 'D353_Biop_Pro1'
D353_Biop_Pro1.obs['position'] = 'Proximal'
D353_Biop_Pro1.obs['method'] = 'Biopsy'
D353_Biop_Pro1.obs['donor'] = 'D353'

# Prefix every barcode with the sample name and use the result as cell name.
# The '_' separator keeps the sample prefix distinguishable from the barcode
# and matches the cell-name format used for the other samples
# (e.g. 'D354_Biop_Pro1_' + barcode).
D353_Biop_Pro1.obs['name'] = [
    'D353_Biop_Pro1_' + barcode for barcode in D353_Biop_Pro1.obs.index
]
D353_Biop_Pro1.obs_names = D353_Biop_Pro1.obs['name']
D353_Biop_Pro1
... reading from cache file ./cache/D353_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 4234 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Quality-control metrics for sample D353_Biop_Pro1.
sc.pl.highest_expr_genes(D353_Biop_Pro1, n_top=20)

# min_genes=0 cannot exclude any cell; the call is used to record the
# per-cell 'n_genes' annotation that the violin plot below relies on.
sc.pp.filter_cells(D353_Biop_Pro1, min_genes=0)

# Total counts per cell, computed once and reused for the fractions below.
total_counts = D353_Biop_Pro1.X.sum(axis=1).A1

# Mitochondrial genes are recognised by their 'MT-' symbol prefix.
# Despite the 'percent_' names, the stored values are fractions in [0, 1].
mito_genes = D353_Biop_Pro1.var_names.str.startswith('MT-')
D353_Biop_Pro1.obs['percent_mito'] = (
    D353_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / total_counts)
D353_Biop_Pro1.obs['n_counts'] = total_counts

# Only RB_genes_in_df (the ribosomal genes present in this dataset) is used
# from the helper's return values; the other two names are still bound as before.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Biop_Pro1.to_df())
# var_names are the columns of to_df(), so the mask can be built without
# converting the matrix to a dense DataFrame a second time.
ribo_genes = D353_Biop_Pro1.var_names.isin(RB_genes_in_df)
D353_Biop_Pro1.obs['percent_ribo'] = (
    D353_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / total_counts)

# CAUTION: despite its name, var['ribo_genes'] stores the NEGATED mask
# (True = gene is not ribosomal); it is used later to keep non-ribosomal genes.
D353_Biop_Pro1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D353_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for sample D353_Biop_Pro1.
# Drop cells with fewer than 500 detected genes (in place).
sc.pp.filter_cells(D353_Biop_Pro1, min_genes=500)

# Keep cells with fewer than 15,000 counts and a mitochondrial fraction
# below 0.25. Both criteria are applied in a single selection, and the result
# is materialised with .copy() so that the per-cell annotations written
# afterwards go into a real AnnData object rather than a view of a view.
cells_to_keep = (
    (D353_Biop_Pro1.obs['n_counts'] < 15000)
    & (D353_Biop_Pro1.obs['percent_mito'] < 0.25)
)
D353_Biop_Pro1 = D353_Biop_Pro1[cells_to_keep, :].copy()
filtered out 41 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the filtered, not yet normalized matrix.
# expected_doublet_rate=0.032 is the doublet rate assumed a priori for this sample.
scrub = scr.Scrublet(D353_Biop_Pro1.X, expected_doublet_rate=0.032)
# scrub_doublets() returns two values, stored below as per-cell annotations:
# a doublet score and a predicted-doublet flag.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D353_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D353_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
# Plot the doublet-score histograms (returns the figure and its two axes).
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.39 Detected doublet rate = 0.4% Estimated detectable doublet fraction = 17.4% Overall doublet rate: Expected = 3.2% Estimated = 2.5% Elapsed time: 2.6 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb9f09240>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb943e438>],
dtype=object))
# doubletDetection
# Second, independent doublet inference: fit the boosted classifier on the
# sample's matrix, then ask it for its per-cell labels and keep them in the
# cell metadata next to the Scrublet results.
clf = doubletdetection.BoostClassifier()
clf.fit(D353_Biop_Pro1.X)
labels = clf.predict()
D353_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.1124141216278076 seconds Jaccard graph constructed in 0.6793932914733887 seconds Wrote graph to binary file in 0.07717013359069824 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.898435 Louvain completed 21 runs in 1.5983195304870605 seconds PhenoGraph complete in 3.4864113330841064 seconds Found communities [-1, ... 20], with sizes: [293, 1875, 602, 412, 354, 339, 261, 206, 169, 143, 106, 103, 64, 56, 46, 37, 36, 35, 34, 29, 22, 15] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.5156469345092773 seconds Jaccard graph constructed in 0.6703004837036133 seconds Wrote graph to binary file in 0.23721718788146973 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904477 Louvain completed 21 runs in 1.6786298751831055 seconds PhenoGraph complete in 4.118005275726318 seconds Found communities [-1, ... 18], with sizes: [263, 1908, 707, 404, 333, 314, 240, 238, 187, 111, 98, 80, 62, 61, 58, 46, 45, 38, 24, 20] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3126440048217773 seconds Jaccard graph constructed in 0.695244550704956 seconds Wrote graph to binary file in 0.2376270294189453 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903017 Louvain completed 21 runs in 1.601078987121582 seconds PhenoGraph complete in 3.8658385276794434 seconds Found communities [-1, ... 19], with sizes: [256, 1906, 629, 418, 359, 349, 348, 253, 173, 88, 72, 58, 54, 47, 45, 40, 38, 37, 30, 23, 14] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3111910820007324 seconds Jaccard graph constructed in 0.6545121669769287 seconds Wrote graph to binary file in 0.07905054092407227 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.894391 After 3 runs, maximum modularity is Q = 0.895731 Louvain completed 23 runs in 2.066336154937744 seconds PhenoGraph complete in 4.128947019577026 seconds Found communities [-1, ... 20], with sizes: [254, 1881, 670, 411, 371, 328, 247, 178, 126, 115, 112, 110, 109, 54, 46, 41, 38, 38, 35, 33, 21, 19] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.1168477535247803 seconds Jaccard graph constructed in 0.6786074638366699 seconds Wrote graph to binary file in 0.23377561569213867 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.898554 Louvain completed 21 runs in 1.6866297721862793 seconds PhenoGraph complete in 3.7325732707977295 seconds Found communities [-1, ... 18], with sizes: [247, 1953, 582, 418, 399, 368, 328, 263, 152, 108, 77, 58, 50, 46, 46, 39, 36, 30, 25, 12] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2134339809417725 seconds Jaccard graph constructed in 0.6675515174865723 seconds Wrote graph to binary file in 0.23641681671142578 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903043 Louvain completed 21 runs in 1.6272809505462646 seconds PhenoGraph complete in 3.7634260654449463 seconds Found communities [-1, ... 18], with sizes: [265, 1929, 662, 417, 383, 333, 315, 183, 171, 109, 90, 65, 63, 48, 47, 42, 41, 32, 24, 18] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.214933156967163 seconds Jaccard graph constructed in 0.672339677810669 seconds Wrote graph to binary file in 0.07712316513061523 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902923 Louvain completed 21 runs in 1.6528730392456055 seconds PhenoGraph complete in 3.6350204944610596 seconds Found communities [-1, ... 
21], with sizes: [250, 1901, 666, 397, 348, 316, 262, 220, 211, 121, 104, 84, 63, 46, 44, 39, 32, 31, 28, 26, 23, 13, 12] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.1127710342407227 seconds Jaccard graph constructed in 0.6458017826080322 seconds Wrote graph to binary file in 0.21735024452209473 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.898646 Louvain completed 21 runs in 1.7578880786895752 seconds PhenoGraph complete in 3.752281427383423 seconds Found communities [-1, ... 18], with sizes: [232, 1873, 674, 409, 378, 367, 349, 270, 110, 105, 84, 61, 57, 47, 46, 45, 40, 39, 31, 20] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3129191398620605 seconds Jaccard graph constructed in 0.6800203323364258 seconds Wrote graph to binary file in 0.0765378475189209 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895991 Louvain completed 21 runs in 1.6098217964172363 seconds PhenoGraph complete in 3.696824312210083 seconds Found communities [-1, ... 19], with sizes: [258, 1929, 581, 399, 387, 333, 249, 241, 189, 137, 110, 83, 55, 54, 47, 40, 37, 31, 29, 27, 21] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.314307451248169 seconds Jaccard graph constructed in 0.8422415256500244 seconds Wrote graph to binary file in 0.0751194953918457 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895983 After 15 runs, maximum modularity is Q = 0.897442 Louvain completed 35 runs in 2.833246946334839 seconds PhenoGraph complete in 5.0813775062561035 seconds Found communities [-1, ... 21], with sizes: [298, 1898, 600, 407, 355, 335, 251, 216, 191, 114, 112, 68, 57, 47, 46, 45, 41, 40, 38, 30, 24, 13, 11] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3129487037658691 seconds Jaccard graph constructed in 0.669677734375 seconds Wrote graph to binary file in 0.24205803871154785 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902166 Louvain completed 21 runs in 1.6867096424102783 seconds PhenoGraph complete in 3.930549144744873 seconds Found communities [-1, ... 19], with sizes: [248, 1867, 677, 403, 394, 382, 351, 157, 109, 106, 79, 77, 58, 57, 51, 47, 46, 39, 39, 32, 18] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3168745040893555 seconds Jaccard graph constructed in 0.7001020908355713 seconds Wrote graph to binary file in 0.07872438430786133 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.897693 Louvain completed 21 runs in 1.6620802879333496 seconds PhenoGraph complete in 3.7772116661071777 seconds Found communities [-1, ... 16], with sizes: [250, 1954, 623, 417, 413, 352, 340, 266, 111, 98, 69, 68, 65, 55, 48, 44, 34, 30] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3143353462219238 seconds Jaccard graph constructed in 0.6794734001159668 seconds Wrote graph to binary file in 0.23280668258666992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905778 Louvain completed 21 runs in 1.6485168933868408 seconds PhenoGraph complete in 3.8949456214904785 seconds Found communities [-1, ... 19], with sizes: [299, 1903, 625, 404, 389, 361, 334, 250, 104, 85, 84, 71, 59, 51, 49, 46, 32, 32, 26, 21, 12] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3123993873596191 seconds Jaccard graph constructed in 0.6693263053894043 seconds Wrote graph to binary file in 0.21507835388183594 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901388 Louvain completed 21 runs in 1.5971555709838867 seconds PhenoGraph complete in 3.8123550415039062 seconds Found communities [-1, ... 
17], with sizes: [295, 1912, 653, 384, 352, 345, 250, 195, 179, 149, 104, 85, 81, 52, 52, 46, 41, 39, 23] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3126029968261719 seconds Jaccard graph constructed in 0.6866474151611328 seconds Wrote graph to binary file in 0.07668447494506836 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.898908 Louvain completed 21 runs in 1.5656707286834717 seconds PhenoGraph complete in 3.6590311527252197 seconds Found communities [-1, ... 18], with sizes: [267, 1946, 655, 401, 387, 337, 332, 260, 103, 86, 78, 74, 67, 57, 45, 41, 40, 28, 21, 12] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.4116289615631104 seconds Jaccard graph constructed in 0.6775290966033936 seconds Wrote graph to binary file in 0.23298907279968262 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901293 After 17 runs, maximum modularity is Q = 0.90241 Louvain completed 37 runs in 3.1329915523529053 seconds PhenoGraph complete in 5.473968029022217 seconds Found communities [-1, ... 20], with sizes: [272, 1909, 631, 409, 359, 333, 229, 185, 168, 134, 113, 91, 59, 57, 56, 48, 43, 38, 31, 27, 27, 18] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3149909973144531 seconds Jaccard graph constructed in 0.6827573776245117 seconds Wrote graph to binary file in 0.24138998985290527 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.898244 Louvain completed 21 runs in 1.6458306312561035 seconds PhenoGraph complete in 3.905277729034424 seconds Found communities [-1, ... 20], with sizes: [264, 1854, 703, 359, 354, 347, 259, 201, 197, 106, 88, 87, 63, 62, 58, 47, 45, 43, 33, 28, 21, 18] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3126351833343506 seconds Jaccard graph constructed in 0.6465389728546143 seconds Wrote graph to binary file in 0.07648181915283203 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902466 After 7 runs, maximum modularity is Q = 0.903564 Louvain completed 27 runs in 2.278730630874634 seconds PhenoGraph complete in 4.331044673919678 seconds Found communities [-1, ... 17], with sizes: [256, 1878, 654, 441, 353, 314, 250, 248, 206, 155, 101, 81, 60, 49, 46, 46, 41, 35, 23] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3133964538574219 seconds Jaccard graph constructed in 0.6469383239746094 seconds Wrote graph to binary file in 0.24372434616088867 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902293 After 2 runs, maximum modularity is Q = 0.904136 Louvain completed 22 runs in 1.887833595275879 seconds PhenoGraph complete in 4.10889196395874 seconds Found communities [-1, ... 20], with sizes: [248, 1930, 678, 416, 383, 373, 300, 184, 105, 102, 92, 70, 46, 46, 42, 42, 40, 36, 32, 28, 22, 22] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3120794296264648 seconds Jaccard graph constructed in 0.6420383453369141 seconds Wrote graph to binary file in 0.07879233360290527 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899317 After 2 runs, maximum modularity is Q = 0.901301 Louvain completed 22 runs in 1.9003074169158936 seconds PhenoGraph complete in 3.951625108718872 seconds Found communities [-1, ... 19], with sizes: [278, 1924, 615, 430, 351, 344, 240, 179, 146, 104, 94, 91, 86, 72, 60, 48, 46, 38, 37, 35, 19] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3140530586242676 seconds Jaccard graph constructed in 0.6724953651428223 seconds Wrote graph to binary file in 0.23605895042419434 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.896487 Louvain completed 21 runs in 1.590254306793213 seconds PhenoGraph complete in 3.8298606872558594 seconds Found communities [-1, ... 18], with sizes: [278, 1956, 627, 418, 347, 337, 246, 162, 135, 112, 103, 89, 82, 81, 59, 58, 42, 39, 35, 31] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.3131976127624512 seconds Jaccard graph constructed in 0.6590533256530762 seconds Wrote graph to binary file in 0.2455885410308838 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.894787 Louvain completed 21 runs in 1.6014740467071533 seconds PhenoGraph complete in 3.8447322845458984 seconds Found communities [-1, ... 16], with sizes: [263, 1941, 636, 409, 373, 360, 344, 249, 188, 133, 62, 54, 52, 46, 44, 34, 26, 23] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9126551151275635 seconds Jaccard graph constructed in 0.675278902053833 seconds Wrote graph to binary file in 0.07746315002441406 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899812 After 3 runs, maximum modularity is Q = 0.901179 Louvain completed 23 runs in 1.9313156604766846 seconds PhenoGraph complete in 3.61487078666687 seconds Found communities [-1, ... 17], with sizes: [250, 1906, 637, 412, 412, 347, 340, 263, 130, 109, 103, 56, 56, 46, 41, 39, 36, 27, 27] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.5129027366638184 seconds Jaccard graph constructed in 0.6978867053985596 seconds Wrote graph to binary file in 0.23975682258605957 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901892 Louvain completed 21 runs in 1.7766077518463135 seconds PhenoGraph complete in 4.2457115650177 seconds Found communities [-1, ... 19], with sizes: [300, 1891, 613, 428, 394, 358, 335, 246, 100, 100, 85, 77, 56, 50, 47, 41, 32, 26, 24, 23, 11] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.316267490386963 seconds Jaccard graph constructed in 0.6971895694732666 seconds Wrote graph to binary file in 0.07730555534362793 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900887 Louvain completed 21 runs in 1.6947228908538818 seconds PhenoGraph complete in 3.8027455806732178 seconds Found communities [-1, ... 17], with sizes: [280, 1927, 636, 412, 377, 350, 347, 256, 157, 104, 81, 54, 46, 45, 45, 41, 38, 23, 18]
# Final step for sample D353_Biop_Pro1: scale every cell to 10,000 total counts,
# log1p-transform, snapshot the result in .raw, then subset the genes.
sc.pp.normalize_per_cell(D353_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D353_Biop_Pro1) # log transform the data
# .raw is assigned AFTER normalization and log1p, so it holds the normalized,
# log-transformed values for all genes (not the original UMI counts).
D353_Biop_Pro1.raw = D353_Biop_Pro1 # keep the normalized, log-transformed, all-gene state in .raw
# Column subset driven by the boolean var['ribo_genes'] column.
# NOTE(review): in the QC cells visible for the other samples this column is True
# for genes that are NOT ribosomal; presumably the same here, i.e. this line drops
# the ribosomal genes (32568 genes remain in the output below) — confirm.
D353_Biop_Pro1 = D353_Biop_Pro1[:, D353_Biop_Pro1.var['ribo_genes']]
# Bare expression: displays the object as the cell output (a view, per the repr below).
D353_Biop_Pro1
View of AnnData object with n_obs × n_vars = 4190 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the filtered 10x count matrix of sample D354_Biop_Pro1, with genes
# indexed by symbol; duplicated symbols are made unique right after loading.
D354_Biop_Pro1 = sc.read_10x_mtx(
    './D354_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
D354_Biop_Pro1.var_names_make_unique()

# Sample-level annotations: the same value is stored for every cell.
D354_Biop_Pro1.obs['manip'] = 'D354_Biop_Pro1'
D354_Biop_Pro1.obs['position'] = 'Proximal'
D354_Biop_Pro1.obs['method'] = 'Biopsy'
D354_Biop_Pro1.obs['donor'] = 'D354'

# Prefix each barcode with the sample name and use the result as the cell name
# (presumably to keep cell names unique once the samples are aggregated).
D354_Biop_Pro1.obs['name'] = [
    'D354_Biop_Pro1_' + barcode for barcode in D354_Biop_Pro1.obs_names
]
D354_Biop_Pro1.obs_names = D354_Biop_Pro1.obs['name']

# Bare expression: displays the object summary as the cell output.
D354_Biop_Pro1
... reading from cache file ./cache/D354_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1877 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell QC metrics for D354_Biop_Pro1, computed on the still unfiltered matrix.
sc.pl.highest_expr_genes(D354_Biop_Pro1, n_top=20)

# min_genes=0 removes no cell; the call is made for the 'n_genes' column it
# adds to .obs, which the violin plot below relies on.
sc.pp.filter_cells(D354_Biop_Pro1, min_genes=0)

# Total counts per cell, computed once and reused by every fraction below.
total_counts = D354_Biop_Pro1.X.sum(axis=1).A1

# Mitochondrial fraction: genes whose symbol starts with 'MT-'.
mito_genes = D354_Biop_Pro1.var_names.str.startswith('MT-')
D354_Biop_Pro1.obs['percent_mito'] = (
    D354_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D354_Biop_Pro1.obs['n_counts'] = total_counts

# Ribosomal fraction. Only RB_genes_in_df (the RB genes present in this matrix)
# is used from the helper; the other two return values are kept as notebook
# variables exactly as before.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D354_Biop_Pro1.to_df()
)
# var_names are the columns of to_df(), so the mask is built from them directly
# instead of building a second dense DataFrame just to read its column names.
ribo_genes = D354_Biop_Pro1.var_names.isin(RB_genes_in_df)
D354_Biop_Pro1.obs['percent_ribo'] = (
    D354_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)
# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT
# ribosomal. It is used further down as a "genes to keep" mask.
D354_Biop_Pro1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D354_Biop_Pro1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Cell-level QC filtering for D354_Biop_Pro1.
# 1) drop cells with fewer than 500 detected genes (in place);
sc.pp.filter_cells(D354_Biop_Pro1, min_genes=500)
# 2) keep cells with n_counts < 30000 AND percent_mito < 0.15.
# Both thresholds are applied in a single selection, which keeps the same cells
# as applying them one after the other. The result is materialized with .copy()
# because later steps assign new columns into .obs; writing into a view would
# otherwise rely on an implicit copy of the view.
D354_Biop_Pro1 = D354_Biop_Pro1[
    (D354_Biop_Pro1.obs['n_counts'] < 30000)
    & (D354_Biop_Pro1.obs['percent_mito'] < 0.15),
    :
].copy()
filtered out 57 cells that have less than 500 genes expressed
# scrublet
# Run Scrublet on the QC-filtered matrix (normalization only happens further
# down, so X still holds the counts as loaded). expected_doublet_rate is set
# per sample.
scrub = scr.Scrublet(D354_Biop_Pro1.X, expected_doublet_rate=0.016)
# Returns per-cell doublet scores and doublet predictions; the score threshold
# is chosen automatically (reported in the log printed by this cell).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata.
D354_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D354_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
# Plot the score histograms; the returned (figure, axes) tuple is what the
# cell displays as its output.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.17 Detected doublet rate = 0.7% Estimated detectable doublet fraction = 34.1% Overall doublet rate: Expected = 1.6% Estimated = 2.1% Elapsed time: 1.2 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb4405940>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb9cbff60>],
dtype=object))
# doubletDetection
# Second doublet caller, run with the classifier's default settings
# (the log printed by this cell shows 25 iterations).
clf = doubletdetection.BoostClassifier()
# The sparse matrix is passed as-is; the library densifies it itself and emits
# the "Sparse raw_counts is automatically densified" UserWarning.
labels = clf.fit(D354_Biop_Pro1.X).predict()
# Store the per-cell call next to the Scrublet results.
# NOTE(review): the label encoding (which value means doublet, and whether
# undecided cells get a missing value) is not visible here — confirm against
# the doubletdetection documentation before filtering on this column.
D354_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20798563957214355 seconds Jaccard graph constructed in 0.43631482124328613 seconds Wrote graph to binary file in 0.03902935981750488 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.894808 Louvain completed 21 runs in 1.1114189624786377 seconds PhenoGraph complete in 1.8260626792907715 seconds Found communities [-1, ... 20], with sizes: [224, 369, 250, 203, 158, 151, 149, 135, 107, 85, 65, 60, 57, 54, 51, 31, 24, 22, 20, 18, 15, 13] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3108201026916504 seconds Jaccard graph constructed in 0.4447774887084961 seconds Wrote graph to binary file in 0.03623771667480469 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89544 After 6 runs, maximum modularity is Q = 0.896991 Louvain completed 26 runs in 1.4794540405273438 seconds PhenoGraph complete in 2.283229351043701 seconds Found communities [-1, ... 21], with sizes: [226, 349, 251, 201, 161, 147, 139, 124, 97, 80, 79, 61, 57, 56, 45, 42, 25, 24, 23, 23, 20, 18, 13] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3073890209197998 seconds Jaccard graph constructed in 0.43344998359680176 seconds Wrote graph to binary file in 0.23255109786987305 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900435 Louvain completed 21 runs in 1.0921292304992676 seconds PhenoGraph complete in 2.0783698558807373 seconds Found communities [-1, ... 23], with sizes: [220, 348, 253, 174, 139, 131, 115, 99, 95, 94, 92, 91, 71, 61, 56, 55, 33, 24, 23, 22, 16, 15, 12, 11, 11] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3080592155456543 seconds Jaccard graph constructed in 0.4336819648742676 seconds Wrote graph to binary file in 0.036360979080200195 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.897128 After 2 runs, maximum modularity is Q = 0.899447 Louvain completed 22 runs in 1.2966358661651611 seconds PhenoGraph complete in 2.0864593982696533 seconds Found communities [-1, ... 21], with sizes: [199, 344, 245, 175, 172, 124, 123, 117, 109, 93, 92, 75, 74, 62, 56, 51, 46, 22, 21, 18, 17, 15, 11] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30846166610717773 seconds Jaccard graph constructed in 0.4344816207885742 seconds Wrote graph to binary file in 0.035085201263427734 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895133 After 3 runs, maximum modularity is Q = 0.896925 Louvain completed 23 runs in 1.3573853969573975 seconds PhenoGraph complete in 2.1456351280212402 seconds Found communities [-1, ... 20], with sizes: [226, 352, 261, 185, 174, 135, 117, 115, 97, 92, 86, 59, 57, 53, 52, 49, 41, 31, 24, 21, 18, 16] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20606327056884766 seconds Jaccard graph constructed in 0.4269881248474121 seconds Wrote graph to binary file in 0.199462890625 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895028 Louvain completed 21 runs in 1.0952200889587402 seconds PhenoGraph complete in 1.9396007061004639 seconds Found communities [-1, ... 20], with sizes: [223, 367, 265, 175, 173, 122, 114, 112, 99, 93, 85, 61, 57, 56, 56, 46, 36, 35, 26, 23, 20, 17] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30696773529052734 seconds Jaccard graph constructed in 0.4124562740325928 seconds Wrote graph to binary file in 0.03569531440734863 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.892923 After 2 runs, maximum modularity is Q = 0.894562 Louvain completed 22 runs in 1.2908604145050049 seconds PhenoGraph complete in 2.0582950115203857 seconds Found communities [-1, ... 19], with sizes: [215, 358, 335, 256, 183, 142, 134, 125, 113, 91, 60, 57, 47, 24, 24, 21, 19, 18, 16, 12, 11] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3103320598602295 seconds Jaccard graph constructed in 0.42313146591186523 seconds Wrote graph to binary file in 0.03623080253601074 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.892364 After 2 runs, maximum modularity is Q = 0.894884 Louvain completed 22 runs in 1.3216302394866943 seconds PhenoGraph complete in 2.1039392948150635 seconds Found communities [-1, ... 20], with sizes: [203, 369, 285, 187, 177, 168, 110, 110, 109, 93, 84, 59, 57, 52, 45, 40, 27, 24, 19, 16, 14, 13] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30880260467529297 seconds Jaccard graph constructed in 0.4333014488220215 seconds Wrote graph to binary file in 0.034192800521850586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893551 Louvain completed 21 runs in 1.118133783340454 seconds PhenoGraph complete in 1.9052391052246094 seconds Found communities [-1, ... 20], with sizes: [227, 358, 328, 229, 170, 156, 97, 90, 83, 82, 78, 63, 56, 55, 35, 31, 29, 25, 23, 21, 14, 11] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20642876625061035 seconds Jaccard graph constructed in 0.43306493759155273 seconds Wrote graph to binary file in 0.19727802276611328 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890663 After 2 runs, maximum modularity is Q = 0.893039 Louvain completed 22 runs in 1.3608157634735107 seconds PhenoGraph complete in 2.2119898796081543 seconds Found communities [-1, ... 20], with sizes: [186, 356, 354, 193, 187, 161, 123, 106, 97, 71, 64, 61, 57, 56, 49, 33, 25, 24, 18, 17, 12, 11] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3070404529571533 seconds Jaccard graph constructed in 0.419095516204834 seconds Wrote graph to binary file in 0.03542590141296387 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893239 Louvain completed 21 runs in 1.0821685791015625 seconds PhenoGraph complete in 1.8562963008880615 seconds Found communities [-1, ... 18], with sizes: [244, 372, 363, 189, 184, 134, 131, 123, 122, 73, 65, 60, 57, 41, 26, 23, 17, 14, 12, 11] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3066115379333496 seconds Jaccard graph constructed in 0.44223952293395996 seconds Wrote graph to binary file in 0.0375819206237793 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.892366 Louvain completed 21 runs in 1.1158223152160645 seconds PhenoGraph complete in 1.9144954681396484 seconds Found communities [-1, ... 19], with sizes: [258, 331, 247, 244, 153, 129, 122, 100, 100, 86, 77, 77, 58, 52, 47, 41, 38, 37, 23, 22, 19] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3083064556121826 seconds Jaccard graph constructed in 0.4503061771392822 seconds Wrote graph to binary file in 0.19364523887634277 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895026 After 3 runs, maximum modularity is Q = 0.896758 Louvain completed 23 runs in 1.326650619506836 seconds PhenoGraph complete in 2.2930917739868164 seconds Found communities [-1, ... 21], with sizes: [247, 326, 258, 191, 164, 161, 126, 113, 101, 81, 79, 61, 58, 57, 53, 38, 28, 25, 25, 19, 18, 18, 14] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2068653106689453 seconds Jaccard graph constructed in 0.42003607749938965 seconds Wrote graph to binary file in 0.03583359718322754 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893939 Louvain completed 21 runs in 1.0809791088104248 seconds PhenoGraph complete in 1.7563679218292236 seconds Found communities [-1, ... 21], with sizes: [228, 347, 223, 187, 174, 164, 164, 116, 96, 96, 80, 59, 55, 49, 44, 41, 36, 25, 19, 16, 15, 14, 13] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30782556533813477 seconds Jaccard graph constructed in 0.42464208602905273 seconds Wrote graph to binary file in 0.036917686462402344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895056 Louvain completed 21 runs in 1.0969853401184082 seconds PhenoGraph complete in 1.878363847732544 seconds Found communities [-1, ... 20], with sizes: [222, 363, 328, 190, 164, 164, 136, 105, 89, 67, 58, 55, 54, 51, 46, 37, 32, 26, 22, 21, 17, 14] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20641303062438965 seconds Jaccard graph constructed in 0.4524691104888916 seconds Wrote graph to binary file in 0.20171022415161133 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893403 Louvain completed 21 runs in 1.10914945602417 seconds PhenoGraph complete in 1.9848384857177734 seconds Found communities [-1, ... 19], with sizes: [212, 351, 264, 210, 192, 145, 119, 112, 103, 98, 70, 67, 60, 56, 55, 54, 25, 20, 20, 17, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3071401119232178 seconds Jaccard graph constructed in 0.4279038906097412 seconds Wrote graph to binary file in 0.03468894958496094 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893619 After 2 runs, maximum modularity is Q = 0.895078 Louvain completed 22 runs in 1.3012712001800537 seconds PhenoGraph complete in 2.0826313495635986 seconds Found communities [-1, ... 18], with sizes: [237, 349, 252, 226, 202, 162, 149, 114, 105, 101, 86, 59, 56, 40, 28, 22, 22, 19, 17, 15] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3069031238555908 seconds Jaccard graph constructed in 0.42010951042175293 seconds Wrote graph to binary file in 0.03604316711425781 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89463 Louvain completed 21 runs in 1.1117844581604004 seconds PhenoGraph complete in 1.886242151260376 seconds Found communities [-1, ... 20], with sizes: [243, 347, 235, 196, 187, 129, 121, 111, 97, 90, 79, 76, 56, 52, 52, 49, 48, 24, 23, 19, 16, 11] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3070688247680664 seconds Jaccard graph constructed in 0.42629456520080566 seconds Wrote graph to binary file in 0.03327345848083496 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893728 Louvain completed 21 runs in 1.0781338214874268 seconds PhenoGraph complete in 1.8543238639831543 seconds Found communities [-1, ... 23], with sizes: [196, 352, 244, 240, 164, 160, 136, 114, 99, 81, 71, 56, 54, 51, 37, 27, 26, 26, 23, 23, 22, 20, 16, 12, 11] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3070995807647705 seconds Jaccard graph constructed in 0.5920555591583252 seconds Wrote graph to binary file in 0.03378438949584961 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.894982 After 4 runs, maximum modularity is Q = 0.896278 After 5 runs, maximum modularity is Q = 0.897416 Louvain completed 25 runs in 1.6029915809631348 seconds PhenoGraph complete in 2.5494496822357178 seconds Found communities [-1, ... 19], with sizes: [250, 357, 244, 182, 165, 163, 142, 123, 103, 88, 86, 62, 55, 55, 40, 36, 33, 23, 20, 18, 16] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3067197799682617 seconds Jaccard graph constructed in 0.43877077102661133 seconds Wrote graph to binary file in 0.0335230827331543 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895023 After 6 runs, maximum modularity is Q = 0.89621 Louvain completed 26 runs in 1.4676036834716797 seconds PhenoGraph complete in 2.2580513954162598 seconds Found communities [-1, ... 22], with sizes: [211, 343, 223, 190, 186, 155, 128, 121, 100, 91, 79, 78, 57, 54, 53, 46, 24, 23, 22, 22, 17, 14, 12, 12] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31211042404174805 seconds Jaccard graph constructed in 0.44854235649108887 seconds Wrote graph to binary file in 0.03439927101135254 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893999 After 2 runs, maximum modularity is Q = 0.895689 Louvain completed 22 runs in 1.305464506149292 seconds PhenoGraph complete in 2.1120829582214355 seconds Found communities [-1, ... 23], with sizes: [230, 340, 252, 192, 165, 133, 111, 109, 100, 96, 94, 69, 55, 51, 50, 44, 29, 24, 21, 21, 21, 16, 15, 12, 11] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3101367950439453 seconds Jaccard graph constructed in 0.44313836097717285 seconds Wrote graph to binary file in 0.19430088996887207 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891348 After 3 runs, maximum modularity is Q = 0.892816 After 4 runs, maximum modularity is Q = 0.895152 Louvain completed 24 runs in 1.5397734642028809 seconds PhenoGraph complete in 2.5027737617492676 seconds Found communities [-1, ... 20], with sizes: [201, 346, 270, 200, 165, 120, 116, 112, 112, 105, 98, 76, 57, 54, 53, 53, 27, 27, 24, 17, 16, 12] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30808544158935547 seconds Jaccard graph constructed in 0.44527363777160645 seconds Wrote graph to binary file in 0.034067392349243164 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.894172 After 4 runs, maximum modularity is Q = 0.895367 After 19 runs, maximum modularity is Q = 0.896371 Louvain completed 39 runs in 2.2144389152526855 seconds PhenoGraph complete in 3.0123860836029053 seconds Found communities [-1, ... 17], with sizes: [247, 361, 253, 231, 175, 169, 127, 113, 113, 106, 60, 55, 53, 51, 47, 33, 26, 25, 16] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20731735229492188 seconds Jaccard graph constructed in 0.42447829246520996 seconds Wrote graph to binary file in 0.03425025939941406 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.896583 Louvain completed 21 runs in 1.082287311553955 seconds PhenoGraph complete in 1.761446475982666 seconds Found communities [-1, ... 20], with sizes: [241, 334, 251, 192, 178, 128, 123, 105, 102, 93, 92, 58, 56, 48, 43, 43, 40, 38, 25, 24, 24, 23]
# Final step for sample D354_Biop_Pro1: scale every cell to 10,000 total counts,
# log1p-transform, snapshot the result in .raw, then drop the ribosomal genes.
sc.pp.normalize_per_cell(D354_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D354_Biop_Pro1) # log transform the data
# .raw is assigned AFTER normalization and log1p, so it holds the normalized,
# log-transformed values for all genes (not the original UMI counts).
D354_Biop_Pro1.raw = D354_Biop_Pro1 # keep the normalized, log-transformed, all-gene state in .raw
# var['ribo_genes'] was built as the negation of the ribosomal mask, i.e. it is
# True for NON-ribosomal genes: this subset therefore removes the ribosomal
# genes (32739 -> 32568 genes in the output below).
D354_Biop_Pro1 = D354_Biop_Pro1[:, D354_Biop_Pro1.var['ribo_genes']]
# Bare expression: displays the object as the cell output (a view, per the repr below).
D354_Biop_Pro1
View of AnnData object with n_obs × n_vars = 1809 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the filtered 10x count matrix of sample D363_Biop_Pro1, with genes
# indexed by symbol; duplicated symbols are made unique right after loading.
D363_Biop_Pro1 = sc.read_10x_mtx(
    './D363_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
D363_Biop_Pro1.var_names_make_unique()

# Sample-level annotations: the same value is stored for every cell.
D363_Biop_Pro1.obs['manip'] = 'D363_Biop_Pro1'
D363_Biop_Pro1.obs['position'] = 'Proximal'
D363_Biop_Pro1.obs['method'] = 'Biopsy'
D363_Biop_Pro1.obs['donor'] = 'D363'

# Prefix each barcode with the sample name and use the result as the cell name
# (presumably to keep cell names unique once the samples are aggregated).
D363_Biop_Pro1.obs['name'] = [
    'D363_Biop_Pro1_' + barcode for barcode in D363_Biop_Pro1.obs_names
]
D363_Biop_Pro1.obs_names = D363_Biop_Pro1.obs['name']

# Bare expression: displays the object summary as the cell output.
D363_Biop_Pro1
... reading from cache file ./cache/D363_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1531 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell QC metrics for D363_Biop_Pro1, computed on the still unfiltered matrix.
sc.pl.highest_expr_genes(D363_Biop_Pro1, n_top=20)

# min_genes=0 removes no cell; the call is made for the 'n_genes' column it
# adds to .obs, which the violin plot below relies on.
sc.pp.filter_cells(D363_Biop_Pro1, min_genes=0)

# Total counts per cell, computed once and reused by every fraction below.
total_counts = D363_Biop_Pro1.X.sum(axis=1).A1

# Mitochondrial fraction: genes whose symbol starts with 'MT-'.
mito_genes = D363_Biop_Pro1.var_names.str.startswith('MT-')
D363_Biop_Pro1.obs['percent_mito'] = (
    D363_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D363_Biop_Pro1.obs['n_counts'] = total_counts

# Ribosomal fraction. Only RB_genes_in_df (the RB genes present in this matrix)
# is used from the helper; the other two return values are kept as notebook
# variables exactly as before.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D363_Biop_Pro1.to_df()
)
# var_names are the columns of to_df(), so the mask is built from them directly
# instead of building a second dense DataFrame just to read its column names.
ribo_genes = D363_Biop_Pro1.var_names.isin(RB_genes_in_df)
D363_Biop_Pro1.obs['percent_ribo'] = (
    D363_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)
# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT
# ribosomal. It is used further down as a "genes to keep" mask.
D363_Biop_Pro1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D363_Biop_Pro1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Cell-level QC filtering for D363_Biop_Pro1.
# 1) drop cells with fewer than 500 detected genes (in place);
sc.pp.filter_cells(D363_Biop_Pro1, min_genes=500)
# 2) keep cells with n_counts < 15000 AND percent_mito < 0.25.
# Both thresholds are applied in a single selection, which keeps the same cells
# as applying them one after the other. The result is materialized with .copy()
# because later steps assign new columns into .obs; writing into a view would
# otherwise rely on an implicit copy of the view.
D363_Biop_Pro1 = D363_Biop_Pro1[
    (D363_Biop_Pro1.obs['n_counts'] < 15000)
    & (D363_Biop_Pro1.obs['percent_mito'] < 0.25),
    :
].copy()
filtered out 20 cells that have less than 500 genes expressed
# scrublet
# Run Scrublet on the QC-filtered matrix (normalization has not been applied
# to this sample at this point, so X still holds the counts as loaded).
# expected_doublet_rate is set per sample.
scrub = scr.Scrublet(D363_Biop_Pro1.X, expected_doublet_rate=0.011)
# Returns per-cell doublet scores and doublet predictions; the score threshold
# is chosen automatically (reported in the log printed by this cell).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata.
D363_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D363_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
# Plot the score histograms; the returned (figure, axes) tuple is what the
# cell displays as its output.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.12 Detected doublet rate = 0.7% Estimated detectable doublet fraction = 17.0% Overall doublet rate: Expected = 1.1% Estimated = 3.9% Elapsed time: 0.8 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb46fca90>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb65d5f98>],
dtype=object))
# doubletDetection
# Second doublet caller, run with the classifier's default settings
# (the log printed by this cell reports iterations out of 25).
clf = doubletdetection.BoostClassifier()
# The sparse matrix is passed as-is; the library densifies it itself and emits
# the "Sparse raw_counts is automatically densified" UserWarning.
labels = clf.fit(D363_Biop_Pro1.X).predict()
# Store the per-cell call next to the Scrublet results.
# NOTE(review): the label encoding (which value means doublet, and whether
# undecided cells get a missing value) is not visible here — confirm against
# the doubletdetection documentation before filtering on this column.
D363_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30674242973327637 seconds Jaccard graph constructed in 0.4128077030181885 seconds Wrote graph to binary file in 0.02238774299621582 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869635 Louvain completed 21 runs in 1.0842034816741943 seconds PhenoGraph complete in 1.836641788482666 seconds Found communities [-1, ... 14], with sizes: [279, 496, 313, 213, 140, 76, 64, 62, 58, 41, 35, 28, 25, 19, 18, 18] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20850038528442383 seconds Jaccard graph constructed in 0.43157458305358887 seconds Wrote graph to binary file in 0.02245807647705078 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.871286 Louvain completed 21 runs in 1.1038267612457275 seconds PhenoGraph complete in 1.7786390781402588 seconds Found communities [-1, ... 15], with sizes: [205, 528, 265, 168, 148, 98, 90, 71, 63, 47, 40, 39, 28, 26, 26, 24, 19] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20830273628234863 seconds Jaccard graph constructed in 0.4142801761627197 seconds Wrote graph to binary file in 0.19663286209106445 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868827 Louvain completed 21 runs in 1.0925955772399902 seconds PhenoGraph complete in 1.9218556880950928 seconds Found communities [-1, ... 14], with sizes: [238, 509, 216, 157, 146, 138, 70, 69, 68, 64, 59, 40, 39, 30, 24, 18] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3095426559448242 seconds Jaccard graph constructed in 0.43410587310791016 seconds Wrote graph to binary file in 0.025043487548828125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869482 Louvain completed 21 runs in 1.117567539215088 seconds PhenoGraph complete in 1.898071050643921 seconds Found communities [-1, ... 13], with sizes: [251, 532, 271, 258, 97, 96, 69, 66, 63, 45, 43, 40, 21, 19, 14] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30872035026550293 seconds Jaccard graph constructed in 0.4299960136413574 seconds Wrote graph to binary file in 0.024828195571899414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.859811 After 3 runs, maximum modularity is Q = 0.861928 After 8 runs, maximum modularity is Q = 0.863107 Louvain completed 28 runs in 1.71533203125 seconds PhenoGraph complete in 2.490504026412964 seconds Found communities [-1, ... 15], with sizes: [207, 569, 246, 138, 119, 112, 97, 64, 60, 55, 50, 49, 38, 26, 24, 17, 14] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30737853050231934 seconds Jaccard graph constructed in 0.4416193962097168 seconds Wrote graph to binary file in 0.023202896118164062 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869901 After 13 runs, maximum modularity is Q = 0.871082 Louvain completed 33 runs in 1.7465119361877441 seconds PhenoGraph complete in 2.5287082195281982 seconds Found communities [-1, ... 15], with sizes: [233, 526, 264, 240, 97, 94, 88, 73, 45, 42, 38, 38, 36, 23, 18, 17, 13] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30819153785705566 seconds Jaccard graph constructed in 0.424422025680542 seconds Wrote graph to binary file in 0.025684356689453125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868794 Louvain completed 21 runs in 1.0887198448181152 seconds PhenoGraph complete in 1.8569042682647705 seconds Found communities [-1, ... 15], with sizes: [240, 506, 265, 259, 96, 94, 72, 65, 53, 53, 40, 33, 31, 30, 20, 17, 11] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3081822395324707 seconds Jaccard graph constructed in 0.4247720241546631 seconds Wrote graph to binary file in 0.0247344970703125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.86858 After 2 runs, maximum modularity is Q = 0.869937 After 9 runs, maximum modularity is Q = 0.870986 Louvain completed 29 runs in 1.7439320087432861 seconds PhenoGraph complete in 2.511955976486206 seconds Found communities [-1, ... 15], with sizes: [268, 525, 276, 211, 147, 93, 68, 57, 55, 37, 36, 29, 26, 18, 15, 12, 12] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30760717391967773 seconds Jaccard graph constructed in 0.41756319999694824 seconds Wrote graph to binary file in 0.1711583137512207 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.870444 Louvain completed 21 runs in 1.0591626167297363 seconds PhenoGraph complete in 1.9658491611480713 seconds Found communities [-1, ... 18], with sizes: [254, 467, 241, 149, 140, 76, 71, 69, 67, 63, 48, 42, 38, 33, 28, 25, 22, 20, 16, 16] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3072676658630371 seconds Jaccard graph constructed in 0.4320073127746582 seconds Wrote graph to binary file in 0.0248870849609375 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87192 Louvain completed 21 runs in 1.073777675628662 seconds PhenoGraph complete in 1.850381851196289 seconds Found communities [-1, ... 16], with sizes: [257, 561, 227, 194, 98, 95, 70, 67, 64, 47, 43, 35, 29, 23, 22, 19, 19, 15] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3071107864379883 seconds Jaccard graph constructed in 0.4304664134979248 seconds Wrote graph to binary file in 0.023406982421875 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.867366 After 5 runs, maximum modularity is Q = 0.868573 Louvain completed 25 runs in 1.4344425201416016 seconds PhenoGraph complete in 2.2068140506744385 seconds Found communities [-1, ... 
13], with sizes: [240, 509, 301, 187, 139, 102, 92, 67, 63, 46, 39, 31, 25, 24, 20] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30753302574157715 seconds Jaccard graph constructed in 0.4122314453125 seconds Wrote graph to binary file in 0.024931669235229492 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.875892 Louvain completed 21 runs in 1.0930914878845215 seconds PhenoGraph complete in 1.8503565788269043 seconds Found communities [-1, ... 15], with sizes: [286, 535, 313, 151, 114, 69, 67, 63, 57, 55, 40, 36, 34, 24, 15, 13, 13] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30872368812561035 seconds Jaccard graph constructed in 0.4181251525878906 seconds Wrote graph to binary file in 0.023557186126708984 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868154 Louvain completed 21 runs in 1.0970056056976318 seconds PhenoGraph complete in 1.8579034805297852 seconds Found communities [-1, ... 14], with sizes: [244, 504, 245, 183, 138, 118, 92, 82, 65, 55, 37, 33, 30, 24, 24, 11] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30840229988098145 seconds Jaccard graph constructed in 0.40717005729675293 seconds Wrote graph to binary file in 0.02349376678466797 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869651 Louvain completed 21 runs in 1.0891599655151367 seconds PhenoGraph complete in 1.8400168418884277 seconds Found communities [-1, ... 14], with sizes: [225, 490, 241, 225, 136, 77, 75, 67, 66, 63, 61, 60, 39, 25, 21, 14] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3091762065887451 seconds Jaccard graph constructed in 0.4022819995880127 seconds Wrote graph to binary file in 0.19213199615478516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87109 Louvain completed 21 runs in 1.099332571029663 seconds PhenoGraph complete in 2.011976480484009 seconds Found communities [-1, ... 15], with sizes: [228, 574, 293, 135, 95, 83, 77, 71, 68, 55, 42, 37, 37, 32, 24, 20, 14] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30831241607666016 seconds Jaccard graph constructed in 0.4155745506286621 seconds Wrote graph to binary file in 0.024890422821044922 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.865493 After 3 runs, maximum modularity is Q = 0.866636 Louvain completed 23 runs in 1.3287489414215088 seconds PhenoGraph complete in 2.0908920764923096 seconds Found communities [-1, ... 
16], with sizes: [241, 491, 255, 225, 109, 100, 81, 68, 64, 51, 40, 33, 32, 27, 24, 17, 16, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.307614803314209 seconds Jaccard graph constructed in 0.45827722549438477 seconds Wrote graph to binary file in 0.024315595626831055 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.875204 Louvain completed 21 runs in 1.076829433441162 seconds PhenoGraph complete in 1.8799591064453125 seconds Found communities [-1, ... 15], with sizes: [253, 530, 238, 154, 107, 94, 75, 70, 66, 60, 50, 40, 34, 33, 29, 29, 23] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3083229064941406 seconds Jaccard graph constructed in 0.37384557723999023 seconds Wrote graph to binary file in 0.027457475662231445 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876345 Louvain completed 21 runs in 1.0791652202606201 seconds PhenoGraph complete in 1.802321195602417 seconds Found communities [-1, ... 15], with sizes: [241, 485, 286, 273, 110, 79, 76, 68, 58, 42, 37, 30, 30, 24, 18, 17, 11] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3073537349700928 seconds Jaccard graph constructed in 0.3943479061126709 seconds Wrote graph to binary file in 0.023504018783569336 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872962 Louvain completed 21 runs in 1.0706963539123535 seconds PhenoGraph complete in 1.8065199851989746 seconds Found communities [-1, ... 14], with sizes: [277, 526, 229, 134, 118, 107, 94, 79, 64, 63, 43, 37, 36, 36, 23, 19] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30828022956848145 seconds Jaccard graph constructed in 0.41057538986206055 seconds Wrote graph to binary file in 0.02418375015258789 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.866867 Louvain completed 21 runs in 1.0627703666687012 seconds PhenoGraph complete in 1.817657232284546 seconds Found communities [-1, ... 13], with sizes: [229, 545, 264, 263, 125, 97, 67, 65, 52, 51, 35, 30, 27, 24, 11] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30927324295043945 seconds Jaccard graph constructed in 0.4102966785430908 seconds Wrote graph to binary file in 0.16541123390197754 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876616 Louvain completed 21 runs in 1.1146433353424072 seconds PhenoGraph complete in 2.0135934352874756 seconds Found communities [-1, ... 
14], with sizes: [246, 521, 252, 218, 134, 114, 60, 58, 53, 49, 46, 35, 32, 32, 24, 11] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3083162307739258 seconds Jaccard graph constructed in 0.4117872714996338 seconds Wrote graph to binary file in 0.02314305305480957 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868976 Louvain completed 21 runs in 1.114424228668213 seconds PhenoGraph complete in 1.8750827312469482 seconds Found communities [-1, ... 16], with sizes: [267, 491, 315, 156, 124, 94, 90, 65, 55, 37, 32, 31, 27, 26, 23, 22, 15, 15] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41069793701171875 seconds Jaccard graph constructed in 0.4212212562561035 seconds Wrote graph to binary file in 0.023562192916870117 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.870094 After 2 runs, maximum modularity is Q = 0.871666 Louvain completed 22 runs in 1.2846720218658447 seconds PhenoGraph complete in 2.1531689167022705 seconds Found communities [-1, ... 12], with sizes: [279, 507, 251, 251, 144, 76, 71, 68, 67, 57, 32, 30, 28, 24] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30826401710510254 seconds Jaccard graph constructed in 0.40161728858947754 seconds Wrote graph to binary file in 0.022974729537963867 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869537 Louvain completed 21 runs in 1.1055395603179932 seconds PhenoGraph complete in 1.8490526676177979 seconds Found communities [-1, ... 13], with sizes: [259, 527, 257, 221, 137, 97, 75, 62, 58, 57, 43, 39, 25, 17, 11] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30946779251098633 seconds Jaccard graph constructed in 0.40761303901672363 seconds Wrote graph to binary file in 0.02495288848876953 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.867995 After 5 runs, maximum modularity is Q = 0.869488 Louvain completed 25 runs in 1.433722972869873 seconds PhenoGraph complete in 2.1903669834136963 seconds Found communities [-1, ... 14], with sizes: [227, 500, 257, 168, 127, 123, 83, 71, 66, 62, 60, 41, 34, 28, 23, 15]
# Post-doublet-inference processing of D363_Biop_Pro1: library-size
# normalisation, log transform, snapshot into .raw, then gene subsetting.
# Scale every cell to a total of 10,000 counts.
sc.pp.normalize_per_cell(D363_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D363_Biop_Pro1) # log transform the data
# .raw is set after normalisation + log1p and before the gene subsetting below,
# so it keeps the normalised, log-transformed values for the full gene set.
D363_Biop_Pro1.raw = D363_Biop_Pro1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): despite its name, this column looks like a mask of
# NON-ribosomal genes (it is built as the negation of the ribosomal mask for
# the D367 sample later in this file) - confirm it is built the same way here.
# The result is a view of the AnnData object (see the recorded repr), not a copy.
D363_Biop_Pro1 = D363_Biop_Pro1[:, D363_Biop_Pro1.var['ribo_genes']]
# Bare expression: displays the object summary in the notebook.
D363_Biop_Pro1
View of AnnData object with n_obs × n_vars = 1508 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x filtered gene/barcode matrix of sample D367_Biop_Pro1, using
# gene symbols as variable names. With cache=True scanpy reuses a cached
# .h5ad copy when one exists (the recorded output shows a read from ./cache/).
D367_Biop_Pro1 = sc.read_10x_mtx(
'./D367_Biop_Pro1/' + outsPath,
var_names='gene_symbols',
cache=True)
# Gene symbols can be duplicated; make the variable names unique.
D367_Biop_Pro1.var_names_make_unique()
# Sample-level metadata, repeated for every cell of this dataset.
D367_Biop_Pro1.obs['manip'] = 'D367_Biop_Pro1'
D367_Biop_Pro1.obs['position'] = 'Proximal'
D367_Biop_Pro1.obs['method'] = 'Biopsy'
D367_Biop_Pro1.obs['donor'] = 'D367'
# Prefix each barcode with the sample name and use the result as the cell
# index - presumably so cell names stay unique once datasets are aggregated.
D367_Biop_Pro1.obs['name'] = ['D367_Biop_Pro1_' + s for s in list(D367_Biop_Pro1.obs.index)]
D367_Biop_Pro1.obs_names = D367_Biop_Pro1.obs['name']
# Bare expression: displays the object summary in the notebook.
D367_Biop_Pro1
... reading from cache file ./cache/D367_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 3180 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell QC metrics for D367_Biop_Pro1: number of genes, total counts,
# mitochondrial fraction and ribosomal fraction, followed by violin plots.

# Plot the 20 most highly expressed genes of the dataset.
sc.pl.highest_expr_genes(D367_Biop_Pro1, n_top=20)

# min_genes=0 cannot remove any cell; the call is presumably made only to
# populate obs['n_genes'], which the violin plot below displays.
sc.pp.filter_cells(D367_Biop_Pro1, min_genes=0)

# Mitochondrial genes are identified by the 'MT-' symbol prefix.
mito_genes = D367_Biop_Pro1.var_names.str.startswith('MT-')
# NOTE: despite the 'percent_' prefix these metrics are fractions in [0, 1].
D367_Biop_Pro1.obs['percent_mito'] = np.sum(
    D367_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D367_Biop_Pro1.X, axis=1).A1
D367_Biop_Pro1.obs['n_counts'] = D367_Biop_Pro1.X.sum(axis=1).A1

# remove_RB_genes works on a dense DataFrame; only RB_genes_in_df (the
# ribosomal genes actually present in this dataset) is used in this cell.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D367_Biop_Pro1.to_df())

# Boolean mask over genes, True for ribosomal genes. The columns of to_df()
# are exactly var_names, so test var_names directly instead of densifying
# the whole cells x genes matrix a second time just to read its labels.
ribo_genes = D367_Biop_Pro1.var_names.isin(RB_genes_in_df)
D367_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D367_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D367_Biop_Pro1.X, axis=1).A1

# CAUTION: var['ribo_genes'] stores the NEGATED mask, i.e. it is True for
# NON-ribosomal genes, so that `adata[:, adata.var['ribo_genes']]` keeps the
# non-ribosomal genes. The column name is kept as-is because other cells of
# the notebook index on it.
D367_Biop_Pro1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D367_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D367_Biop_Pro1:
#   - drop cells with fewer than 500 detected genes,
#   - drop cells with 30,000 total counts or more,
#   - drop cells whose mitochondrial fraction is 0.4 or more
#     (obs['percent_mito'] is stored as a fraction, so 0.4 means 40%).
sc.pp.filter_cells(D367_Biop_Pro1, min_genes=500)
D367_Biop_Pro1 = D367_Biop_Pro1[D367_Biop_Pro1.obs['n_counts'] < 30000, :]
# Boolean subsetting returns a *view* of the parent AnnData. The doublet
# inference steps that follow write new columns into .obs, which on a view
# forces an implicit copy; materialise the filtered object explicitly instead.
D367_Biop_Pro1 = D367_Biop_Pro1[D367_Biop_Pro1.obs['percent_mito'] < 0.4, :].copy()
filtered out 7 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the filtered count matrix of
# D367_Biop_Pro1, with an expected doublet rate of 2.4%.
scrub = scr.Scrublet(D367_Biop_Pro1.X, expected_doublet_rate=0.024)
# scrub_doublets() returns a doublet score and a doublet call per cell; the
# score threshold is chosen automatically (see the recorded output).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D367_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D367_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
# Score histogram, used to eyeball the automatically chosen threshold.
# As the last expression of the cell, its (figure, axes) return value is
# also echoed in the notebook output.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.30 Detected doublet rate = 0.6% Estimated detectable doublet fraction = 16.0% Overall doublet rate: Expected = 2.4% Estimated = 3.8% Elapsed time: 2.4 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb62285c0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb62ea748>],
dtype=object))
# doubletDetection
# Second, independent doublet inference with DoubletDetection, using the
# classifier's default settings (the recorded log shows 25 iterations).
clf = doubletdetection.BoostClassifier()
# .X is sparse here; doubletdetection densifies it itself and emits a
# UserWarning about it (see the recorded output).
labels = clf.fit(D367_Biop_Pro1.X).predict()
# Store the per-cell call in the cell metadata.
# NOTE(review): the label encoding (presumably 1 = doublet, 0 = singlet) and
# the handling of ambiguous cells are not visible here - confirm against the
# doubletdetection documentation for the installed version.
D367_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8105325698852539 seconds Jaccard graph constructed in 0.5572328567504883 seconds Wrote graph to binary file in 0.05981707572937012 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.920174 Louvain completed 21 runs in 1.4654748439788818 seconds PhenoGraph complete in 2.9070277214050293 seconds Found communities [-1, ... 21], with sizes: [194, 1216, 420, 238, 223, 212, 167, 158, 137, 115, 108, 94, 92, 85, 83, 80, 74, 53, 52, 49, 40, 35, 23] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7097203731536865 seconds Jaccard graph constructed in 0.5629580020904541 seconds Wrote graph to binary file in 0.25125646591186523 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.919261 After 11 runs, maximum modularity is Q = 0.920274 Louvain completed 31 runs in 2.2788219451904297 seconds PhenoGraph complete in 3.817668914794922 seconds Found communities [-1, ... 23], with sizes: [194, 1171, 421, 266, 232, 198, 179, 154, 150, 116, 109, 92, 90, 85, 85, 83, 54, 52, 47, 41, 40, 37, 23, 16, 13] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9105391502380371 seconds Jaccard graph constructed in 0.5836596488952637 seconds Wrote graph to binary file in 0.06721949577331543 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.923622 Louvain completed 21 runs in 1.4665532112121582 seconds PhenoGraph complete in 3.043471574783325 seconds Found communities [-1, ... 23], with sizes: [195, 1131, 438, 244, 233, 228, 184, 173, 141, 123, 121, 106, 91, 79, 71, 57, 56, 55, 49, 45, 43, 37, 23, 14, 11] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8099977970123291 seconds Jaccard graph constructed in 0.5825245380401611 seconds Wrote graph to binary file in 0.23905181884765625 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.92033 Louvain completed 21 runs in 1.4683146476745605 seconds PhenoGraph complete in 3.1150622367858887 seconds Found communities [-1, ... 22], with sizes: [187, 1154, 431, 348, 237, 224, 172, 168, 140, 112, 105, 94, 86, 76, 71, 59, 54, 50, 49, 43, 36, 24, 15, 13] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8167722225189209 seconds Jaccard graph constructed in 0.5718975067138672 seconds Wrote graph to binary file in 0.06685614585876465 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.920141 Louvain completed 21 runs in 1.4451842308044434 seconds PhenoGraph complete in 2.9165916442871094 seconds Found communities [-1, ... 22], with sizes: [175, 1193, 421, 264, 242, 221, 175, 145, 135, 109, 107, 105, 92, 89, 81, 72, 62, 56, 51, 46, 35, 29, 24, 19] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8103835582733154 seconds Jaccard graph constructed in 0.5667843818664551 seconds Wrote graph to binary file in 0.21831989288330078 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.92355 Louvain completed 21 runs in 1.455714464187622 seconds PhenoGraph complete in 3.066343069076538 seconds Found communities [-1, ... 22], with sizes: [196, 1199, 444, 239, 228, 217, 166, 148, 127, 124, 108, 97, 87, 82, 80, 74, 71, 54, 52, 45, 45, 29, 24, 12] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8093373775482178 seconds Jaccard graph constructed in 0.5747344493865967 seconds Wrote graph to binary file in 0.06644582748413086 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.922434 Louvain completed 21 runs in 1.509354829788208 seconds PhenoGraph complete in 2.972931146621704 seconds Found communities [-1, ... 
22], with sizes: [189, 1165, 408, 323, 244, 170, 166, 155, 152, 121, 105, 91, 91, 88, 84, 83, 60, 55, 43, 41, 41, 35, 24, 14] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.909996747970581 seconds Jaccard graph constructed in 0.6303787231445312 seconds Wrote graph to binary file in 0.23954415321350098 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.921651 Louvain completed 21 runs in 1.4474704265594482 seconds PhenoGraph complete in 3.2454562187194824 seconds Found communities [-1, ... 21], with sizes: [184, 1208, 414, 315, 251, 217, 167, 165, 152, 114, 107, 91, 88, 85, 73, 60, 56, 48, 46, 40, 30, 24, 13] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.810338020324707 seconds Jaccard graph constructed in 0.5772206783294678 seconds Wrote graph to binary file in 0.06661152839660645 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.917956 After 20 runs, maximum modularity is Q = 0.918981 Louvain completed 40 runs in 2.744997024536133 seconds PhenoGraph complete in 4.215928077697754 seconds Found communities [-1, ... 21], with sizes: [200, 1203, 420, 267, 234, 219, 170, 167, 137, 114, 105, 92, 86, 82, 73, 64, 59, 58, 51, 48, 40, 36, 23] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9109387397766113 seconds Jaccard graph constructed in 0.5822474956512451 seconds Wrote graph to binary file in 0.24210119247436523 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.918845 Louvain completed 21 runs in 1.4738397598266602 seconds PhenoGraph complete in 3.226494550704956 seconds Found communities [-1, ... 21], with sizes: [193, 1237, 419, 243, 226, 223, 193, 172, 161, 136, 107, 93, 84, 69, 67, 60, 59, 51, 51, 37, 30, 23, 14] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8096990585327148 seconds Jaccard graph constructed in 0.5590062141418457 seconds Wrote graph to binary file in 0.06928634643554688 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.922123 Louvain completed 21 runs in 1.453864336013794 seconds PhenoGraph complete in 2.907410144805908 seconds Found communities [-1, ... 21], with sizes: [188, 1156, 470, 432, 330, 171, 141, 135, 123, 108, 90, 87, 86, 74, 67, 67, 46, 45, 44, 29, 22, 22, 15] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9095442295074463 seconds Jaccard graph constructed in 0.5906589031219482 seconds Wrote graph to binary file in 0.23798322677612305 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.920328 Louvain completed 21 runs in 1.4951162338256836 seconds PhenoGraph complete in 3.248774290084839 seconds Found communities [-1, ... 
21], with sizes: [185, 1191, 453, 341, 236, 234, 143, 143, 141, 118, 107, 91, 88, 78, 73, 54, 51, 47, 46, 42, 35, 34, 17] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8095653057098389 seconds Jaccard graph constructed in 0.5797111988067627 seconds Wrote graph to binary file in 0.06597495079040527 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.921981 Louvain completed 21 runs in 1.5110702514648438 seconds PhenoGraph complete in 2.9795186519622803 seconds Found communities [-1, ... 22], with sizes: [140, 1230, 444, 306, 226, 223, 165, 158, 147, 126, 111, 93, 90, 81, 79, 62, 53, 45, 42, 39, 35, 23, 17, 13] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.809781551361084 seconds Jaccard graph constructed in 0.7734463214874268 seconds Wrote graph to binary file in 0.065277099609375 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.921835 Louvain completed 21 runs in 1.4789063930511475 seconds PhenoGraph complete in 3.143568754196167 seconds Found communities [-1, ... 23], with sizes: [192, 1196, 402, 332, 248, 214, 162, 144, 141, 133, 106, 93, 90, 81, 72, 56, 49, 47, 45, 39, 36, 22, 20, 17, 11] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8100326061248779 seconds Jaccard graph constructed in 0.5675520896911621 seconds Wrote graph to binary file in 0.21927332878112793 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.917905 After 7 runs, maximum modularity is Q = 0.91893 Louvain completed 27 runs in 2.031409978866577 seconds PhenoGraph complete in 3.6419076919555664 seconds Found communities [-1, ... 22], with sizes: [204, 1191, 412, 336, 214, 203, 197, 170, 127, 108, 106, 92, 86, 83, 79, 52, 51, 44, 43, 43, 37, 35, 24, 11] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8096253871917725 seconds Jaccard graph constructed in 0.5573554039001465 seconds Wrote graph to binary file in 0.06541252136230469 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.92422 Louvain completed 21 runs in 1.4452247619628906 seconds PhenoGraph complete in 2.8921914100646973 seconds Found communities [-1, ... 23], with sizes: [193, 1171, 440, 334, 248, 220, 154, 151, 143, 113, 105, 93, 91, 76, 74, 62, 58, 51, 41, 34, 32, 23, 18, 12, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8086884021759033 seconds Jaccard graph constructed in 0.5715727806091309 seconds Wrote graph to binary file in 0.06446361541748047 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.922189 Louvain completed 21 runs in 1.492135763168335 seconds PhenoGraph complete in 2.94960618019104 seconds Found communities [-1, ... 24], with sizes: [180, 1212, 435, 296, 240, 174, 150, 143, 131, 121, 106, 93, 87, 85, 84, 82, 62, 52, 45, 42, 40, 28, 23, 13, 12, 12] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8086469173431396 seconds Jaccard graph constructed in 0.7175347805023193 seconds Wrote graph to binary file in 0.0649256706237793 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.924324 After 4 runs, maximum modularity is Q = 0.925447 Louvain completed 24 runs in 1.8387408256530762 seconds PhenoGraph complete in 3.444546937942505 seconds Found communities [-1, ... 22], with sizes: [160, 1243, 446, 294, 234, 227, 162, 155, 131, 117, 108, 91, 89, 80, 80, 61, 57, 54, 41, 34, 29, 23, 19, 13] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8097279071807861 seconds Jaccard graph constructed in 0.5598583221435547 seconds Wrote graph to binary file in 0.22486257553100586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.918037 Louvain completed 21 runs in 1.5992670059204102 seconds PhenoGraph complete in 3.2090132236480713 seconds Found communities [-1, ... 22], with sizes: [185, 1207, 432, 257, 257, 231, 148, 137, 137, 127, 107, 93, 89, 81, 78, 67, 67, 54, 47, 41, 39, 34, 22, 11] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.810072660446167 seconds Jaccard graph constructed in 0.5778443813323975 seconds Wrote graph to binary file in 0.06517863273620605 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.922872 Louvain completed 21 runs in 1.4541747570037842 seconds PhenoGraph complete in 2.9231276512145996 seconds Found communities [-1, ... 25], with sizes: [178, 1192, 426, 234, 223, 199, 156, 155, 146, 118, 108, 105, 93, 85, 84, 72, 70, 60, 47, 44, 39, 28, 23, 22, 17, 13, 11] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9112532138824463 seconds Jaccard graph constructed in 0.5815844535827637 seconds Wrote graph to binary file in 0.06464576721191406 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.920535 After 5 runs, maximum modularity is Q = 0.921564 Louvain completed 25 runs in 1.9627289772033691 seconds PhenoGraph complete in 3.5345304012298584 seconds Found communities [-1, ... 23], with sizes: [179, 1192, 430, 255, 246, 241, 164, 143, 142, 124, 107, 91, 84, 77, 72, 72, 56, 53, 46, 43, 41, 35, 23, 17, 15] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8093485832214355 seconds Jaccard graph constructed in 0.7571942806243896 seconds Wrote graph to binary file in 0.06494641304016113 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.918662 Louvain completed 21 runs in 1.488870620727539 seconds PhenoGraph complete in 3.135673761367798 seconds Found communities [-1, ... 21], with sizes: [220, 1199, 429, 308, 252, 222, 153, 148, 136, 121, 107, 92, 83, 81, 79, 70, 58, 40, 40, 38, 38, 23, 11] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9098646640777588 seconds Jaccard graph constructed in 0.5914657115936279 seconds Wrote graph to binary file in 0.22304725646972656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.922815 After 4 runs, maximum modularity is Q = 0.923905 Louvain completed 24 runs in 1.8239877223968506 seconds PhenoGraph complete in 3.5645148754119873 seconds Found communities [-1, ... 22], with sizes: [195, 1211, 411, 249, 227, 225, 173, 153, 141, 120, 105, 93, 92, 79, 79, 73, 69, 49, 45, 43, 40, 39, 23, 14] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9108030796051025 seconds Jaccard graph constructed in 0.5746753215789795 seconds Wrote graph to binary file in 0.06561899185180664 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.923058 Louvain completed 21 runs in 1.4927396774291992 seconds PhenoGraph complete in 3.0595078468322754 seconds Found communities [-1, ... 22], with sizes: [203, 1185, 453, 304, 278, 166, 162, 146, 143, 131, 106, 90, 81, 73, 68, 67, 64, 56, 46, 40, 34, 23, 18, 11] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9106283187866211 seconds Jaccard graph constructed in 0.5874824523925781 seconds Wrote graph to binary file in 0.06527256965637207 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.922426 Louvain completed 21 runs in 1.42024564743042 seconds PhenoGraph complete in 2.999218702316284 seconds Found communities [-1, ... 22], with sizes: [217, 1200, 409, 256, 225, 215, 168, 161, 136, 116, 111, 92, 91, 88, 76, 74, 54, 50, 50, 47, 40, 38, 23, 11]
# Scale every cell to 10,000 total counts, then log1p-transform the values.
sc.pp.normalize_per_cell(D367_Biop_Pro1, counts_per_cell_after=1e4)
sc.pp.log1p(D367_Biop_Pro1)

# Keep the normalized, log-transformed matrix (all genes) in .raw before the
# gene subset below is taken.
D367_Biop_Pro1.raw = D367_Biop_Pro1

# var['ribo_genes'] is used as the gene keep-mask.
# NOTE(review): presumably True for NON-ribosomal genes, as for the other
# samples in this notebook -- confirm against the cell that sets it.
genes_to_keep = D367_Biop_Pro1.var['ribo_genes']
D367_Biop_Pro1 = D367_Biop_Pro1[:, genes_to_keep]
D367_Biop_Pro1
View of AnnData object with n_obs × n_vars = 3159 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the Cell Ranger filtered matrix for sample D372_Biop_Pro1, indexed by
# gene symbol (scanpy caches the parsed matrix, see the log line below).
D372_Biop_Pro1 = sc.read_10x_mtx(
    './D372_Biop_Pro1/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
D372_Biop_Pro1.var_names_make_unique()

# Sample-level annotations, broadcast to every cell (insertion order is kept).
sample_annotations = {
    'manip': 'D372_Biop_Pro1',
    'position': 'Proximal',
    'method': 'Biopsy',
    'donor': 'D372',
}
for obs_key, obs_value in sample_annotations.items():
    D372_Biop_Pro1.obs[obs_key] = obs_value

# Prefix each barcode with the sample name and use the result as the cell index.
D372_Biop_Pro1.obs['name'] = [
    'D372_Biop_Pro1_' + barcode for barcode in D372_Biop_Pro1.obs.index
]
D372_Biop_Pro1.obs_names = D372_Biop_Pro1.obs['name']
D372_Biop_Pro1
... reading from cache file ./cache/D372_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 4585 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell QC metrics for D372_Biop_Pro1: detected genes, total counts, and
# the fraction of counts coming from mitochondrial / ribosomal genes.
sc.pl.highest_expr_genes(D372_Biop_Pro1, n_top=20)

# min_genes=0 is not meant to remove cells; the call is made here so that
# obs['n_genes'] exists for the violin plot below.
sc.pp.filter_cells(D372_Biop_Pro1, min_genes=0)

# Total counts per cell, computed once and reused for every ratio below.
total_counts = D372_Biop_Pro1.X.sum(axis=1).A1

# 'percent_mito' is stored as a fraction in [0, 1], not as a percentage.
mito_genes = D372_Biop_Pro1.var_names.str.startswith('MT-')
D372_Biop_Pro1.obs['percent_mito'] = (
    D372_Biop_Pro1[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D372_Biop_Pro1.obs['n_counts'] = total_counts

# remove_RB_genes needs a dense cells x genes DataFrame; build it only once.
# Only the list of ribosomal genes found in this sample is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D372_Biop_Pro1.to_df()
)

# The columns of to_df() are var_names, so the membership mask can be built
# from var_names directly instead of densifying the matrix a second time.
ribo_genes = D372_Biop_Pro1.var_names.isin(RB_genes_in_df)
D372_Biop_Pro1.obs['percent_ribo'] = (
    D372_Biop_Pro1[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)

# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT
# ribosomal; it is used later in the notebook as the gene keep-mask.
D372_Biop_Pro1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D372_Biop_Pro1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D372_Biop_Pro1.
# 1) Drop cells with fewer than 500 detected genes (done in place).
sc.pp.filter_cells(D372_Biop_Pro1, min_genes=500)

# 2) Keep only cells with fewer than 30,000 total counts.
within_count_limit = D372_Biop_Pro1.obs['n_counts'] < 30000
D372_Biop_Pro1 = D372_Biop_Pro1[within_count_limit, :]

# 3) Keep only cells whose mitochondrial fraction is below 0.3.
within_mito_limit = D372_Biop_Pro1.obs['percent_mito'] < 0.3
D372_Biop_Pro1 = D372_Biop_Pro1[within_mito_limit, :]
filtered out 4 cells that have less than 500 genes expressed
# Scrublet doublet inference on the filtered count matrix (this runs before
# the normalization step further down).
# expected_doublet_rate is chosen per sample (0.038 for this one).
scrub = scr.Scrublet(D372_Biop_Pro1.X, expected_doublet_rate=0.038)
# scrub_doublets() yields two per-cell results: a score and a doublet call.
# The score threshold is set automatically (see the log printed below).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata.
D372_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D372_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
# Last expression of the cell, so the returned figure is displayed.
scrub.plot_histogram()
Preprocessing...
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/scrublet/helper_functions.py:238: RuntimeWarning: invalid value encountered in log gLog = lambda input: np.log(input[1] * np.exp(-input[0]) + input[2])
Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.45 Detected doublet rate = 0.3% Estimated detectable doublet fraction = 7.3% Overall doublet rate: Expected = 3.8% Estimated = 4.2% Elapsed time: 4.0 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb663b198>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb46e2ac8>],
dtype=object))
# DoubletDetection: second doublet caller, used with the default
# BoostClassifier settings.
clf = doubletdetection.BoostClassifier()
# Fit on the same filtered count matrix as Scrublet; the sparse matrix is
# densified by the library (see the UserWarning below).
labels = clf.fit(D372_Biop_Pro1.X).predict()
# Store the per-cell labels alongside the Scrublet results.
D372_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9112594127655029 seconds Jaccard graph constructed in 0.7462267875671387 seconds Wrote graph to binary file in 0.28792905807495117 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.928885 Louvain completed 21 runs in 1.7388286590576172 seconds PhenoGraph complete in 3.702601909637451 seconds Found communities [-1, ... 23], with sizes: [238, 1608, 936, 499, 469, 425, 221, 191, 121, 120, 109, 105, 100, 92, 78, 66, 64, 54, 50, 41, 39, 34, 25, 21, 11] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6132426261901855 seconds Jaccard graph constructed in 0.7779467105865479 seconds Wrote graph to binary file in 0.28315186500549316 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.930108 Louvain completed 21 runs in 1.9567527770996094 seconds PhenoGraph complete in 4.653115749359131 seconds Found communities [-1, ... 22], with sizes: [233, 1628, 953, 491, 481, 459, 216, 167, 126, 123, 104, 102, 93, 85, 82, 69, 64, 50, 48, 43, 34, 31, 21, 14] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.1129202842712402 seconds Jaccard graph constructed in 0.7580430507659912 seconds Wrote graph to binary file in 0.09833741188049316 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.924756 Louvain completed 21 runs in 1.9004945755004883 seconds PhenoGraph complete in 3.8885812759399414 seconds Found communities [-1, ... 24], with sizes: [240, 1644, 921, 509, 436, 430, 223, 196, 127, 125, 110, 102, 95, 90, 89, 65, 64, 50, 41, 31, 30, 27, 22, 22, 17, 11] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.7137095928192139 seconds Jaccard graph constructed in 1.0560684204101562 seconds Wrote graph to binary file in 0.09852480888366699 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.929801 Louvain completed 21 runs in 1.9685957431793213 seconds PhenoGraph complete in 4.861473083496094 seconds Found communities [-1, ... 25], with sizes: [192, 1539, 841, 543, 462, 411, 206, 206, 161, 138, 131, 106, 103, 97, 94, 69, 68, 65, 59, 42, 35, 33, 32, 29, 23, 20, 12] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6131210327148438 seconds Jaccard graph constructed in 0.7775664329528809 seconds Wrote graph to binary file in 0.28657078742980957 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.927946 Louvain completed 21 runs in 1.811650276184082 seconds PhenoGraph complete in 4.507956266403198 seconds Found communities [-1, ... 23], with sizes: [218, 1669, 832, 537, 443, 438, 232, 167, 148, 106, 106, 101, 98, 84, 80, 75, 67, 66, 50, 47, 41, 34, 33, 32, 13] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6162004470825195 seconds Jaccard graph constructed in 0.7626640796661377 seconds Wrote graph to binary file in 0.30016422271728516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.927583 Louvain completed 21 runs in 1.8646981716156006 seconds PhenoGraph complete in 4.562420606613159 seconds Found communities [-1, ... 22], with sizes: [225, 1552, 959, 539, 503, 408, 230, 169, 132, 110, 109, 104, 103, 102, 94, 65, 64, 53, 48, 41, 36, 35, 22, 14] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.816399097442627 seconds Jaccard graph constructed in 0.7958872318267822 seconds Wrote graph to binary file in 0.09909796714782715 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.927359 Louvain completed 21 runs in 1.8575026988983154 seconds PhenoGraph complete in 4.594034671783447 seconds Found communities [-1, ... 23], with sizes: [218, 1611, 970, 495, 469, 411, 222, 209, 124, 111, 106, 104, 97, 97, 89, 65, 63, 57, 44, 41, 35, 24, 22, 20, 13] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.815995216369629 seconds Jaccard graph constructed in 0.762723445892334 seconds Wrote graph to binary file in 0.2827150821685791 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.930099 Louvain completed 21 runs in 1.9286956787109375 seconds PhenoGraph complete in 4.808043003082275 seconds Found communities [-1, ... 24], with sizes: [226, 1632, 903, 521, 460, 444, 214, 159, 122, 120, 114, 106, 102, 94, 88, 71, 65, 62, 41, 38, 33, 24, 24, 22, 21, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.5132901668548584 seconds Jaccard graph constructed in 0.7941238880157471 seconds Wrote graph to binary file in 0.2744612693786621 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.926996 Louvain completed 21 runs in 1.96500825881958 seconds PhenoGraph complete in 4.569213628768921 seconds Found communities [-1, ... 25], with sizes: [241, 1566, 864, 541, 467, 418, 237, 198, 131, 116, 109, 106, 100, 86, 78, 66, 64, 63, 50, 42, 37, 31, 28, 23, 23, 20, 12] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.511993646621704 seconds Jaccard graph constructed in 0.7374269962310791 seconds Wrote graph to binary file in 0.09723782539367676 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.9266 Louvain completed 21 runs in 1.897585391998291 seconds PhenoGraph complete in 4.262134790420532 seconds Found communities [-1, ... 23], with sizes: [201, 1660, 897, 498, 495, 436, 221, 179, 127, 113, 108, 107, 104, 103, 79, 69, 64, 50, 50, 42, 37, 31, 21, 13, 12] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6146581172943115 seconds Jaccard graph constructed in 0.756951093673706 seconds Wrote graph to binary file in 0.0980679988861084 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.929866 Louvain completed 21 runs in 1.9732599258422852 seconds PhenoGraph complete in 4.465601205825806 seconds Found communities [-1, ... 26], with sizes: [222, 1672, 780, 491, 468, 419, 220, 152, 128, 118, 118, 104, 102, 98, 90, 76, 71, 67, 59, 54, 43, 32, 31, 31, 23, 20, 17, 11] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6129565238952637 seconds Jaccard graph constructed in 0.747774600982666 seconds Wrote graph to binary file in 0.2811613082885742 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.928524 Louvain completed 21 runs in 1.9010710716247559 seconds PhenoGraph complete in 4.561993837356567 seconds Found communities [-1, ... 25], with sizes: [197, 1595, 871, 588, 467, 413, 219, 142, 131, 128, 122, 105, 104, 93, 85, 67, 67, 53, 49, 42, 38, 36, 30, 23, 22, 17, 13] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6130390167236328 seconds Jaccard graph constructed in 0.7356076240539551 seconds Wrote graph to binary file in 0.2615077495574951 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.928517 Louvain completed 21 runs in 1.8789329528808594 seconds PhenoGraph complete in 4.506845712661743 seconds Found communities [-1, ... 22], with sizes: [236, 1633, 883, 538, 464, 426, 221, 193, 129, 114, 110, 105, 100, 87, 78, 72, 65, 56, 50, 41, 37, 33, 26, 20] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6142187118530273 seconds Jaccard graph constructed in 0.7434346675872803 seconds Wrote graph to binary file in 0.09998083114624023 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.928991 Louvain completed 21 runs in 1.9372618198394775 seconds PhenoGraph complete in 4.413788080215454 seconds Found communities [-1, ... 23], with sizes: [190, 1597, 912, 539, 509, 405, 214, 186, 131, 122, 113, 104, 102, 94, 88, 71, 67, 53, 47, 43, 40, 31, 29, 17, 13] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.1131341457366943 seconds Jaccard graph constructed in 1.018934726715088 seconds Wrote graph to binary file in 0.11078023910522461 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.931325 Louvain completed 21 runs in 2.0481789112091064 seconds PhenoGraph complete in 4.313236474990845 seconds Found communities [-1, ... 23], with sizes: [213, 1547, 936, 533, 454, 445, 222, 217, 127, 119, 109, 107, 103, 95, 84, 65, 65, 58, 41, 39, 35, 34, 29, 23, 17] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.1158053874969482 seconds Jaccard graph constructed in 0.7455377578735352 seconds Wrote graph to binary file in 0.2893695831298828 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.927404 Louvain completed 21 runs in 1.8686316013336182 seconds PhenoGraph complete in 4.04028582572937 seconds Found communities [-1, ... 24], with sizes: [217, 1606, 912, 539, 506, 415, 206, 195, 112, 112, 108, 104, 97, 90, 85, 68, 65, 56, 43, 35, 33, 32, 31, 23, 16, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.619184970855713 seconds Jaccard graph constructed in 0.7403817176818848 seconds Wrote graph to binary file in 0.2953202724456787 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.927235 Louvain completed 21 runs in 1.9175662994384766 seconds PhenoGraph complete in 4.594003200531006 seconds Found communities [-1, ... 24], with sizes: [236, 1644, 928, 492, 477, 431, 219, 155, 123, 112, 104, 104, 100, 100, 88, 75, 68, 47, 40, 40, 34, 27, 23, 22, 17, 11] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6144399642944336 seconds Jaccard graph constructed in 0.7484476566314697 seconds Wrote graph to binary file in 0.09547019004821777 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.928154 After 8 runs, maximum modularity is Q = 0.929179 Louvain completed 28 runs in 2.6022768020629883 seconds PhenoGraph complete in 5.083362340927124 seconds Found communities [-1, ... 24], with sizes: [215, 1648, 809, 514, 457, 443, 250, 160, 131, 129, 110, 104, 92, 90, 84, 76, 66, 65, 54, 46, 42, 37, 32, 26, 22, 15] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.5127384662628174 seconds Jaccard graph constructed in 0.7312557697296143 seconds Wrote graph to binary file in 0.27966833114624023 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.930157 Louvain completed 21 runs in 1.9421899318695068 seconds PhenoGraph complete in 4.4853222370147705 seconds Found communities [-1, ... 24], with sizes: [233, 1604, 842, 501, 479, 411, 217, 212, 127, 117, 111, 111, 103, 98, 96, 74, 67, 61, 52, 42, 37, 33, 25, 23, 22, 19] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6139726638793945 seconds Jaccard graph constructed in 0.7577183246612549 seconds Wrote graph to binary file in 0.2873423099517822 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.927316 Louvain completed 21 runs in 1.8524832725524902 seconds PhenoGraph complete in 4.533081531524658 seconds Found communities [-1, ... 23], with sizes: [211, 1728, 812, 493, 470, 429, 234, 162, 140, 131, 113, 107, 105, 100, 82, 66, 66, 58, 49, 41, 34, 32, 23, 19, 12] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6131353378295898 seconds Jaccard graph constructed in 0.73968505859375 seconds Wrote graph to binary file in 0.09642791748046875 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.926533 Louvain completed 21 runs in 1.874847412109375 seconds PhenoGraph complete in 4.343479633331299 seconds Found communities [-1, ... 24], with sizes: [237, 1611, 901, 518, 458, 435, 214, 200, 133, 113, 108, 104, 102, 93, 87, 67, 67, 56, 42, 41, 31, 29, 22, 21, 15, 12] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.614177942276001 seconds Jaccard graph constructed in 0.7559611797332764 seconds Wrote graph to binary file in 0.2796041965484619 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.926431 Louvain completed 21 runs in 1.8801405429840088 seconds PhenoGraph complete in 4.560997247695923 seconds Found communities [-1, ... 24], with sizes: [202, 1700, 824, 503, 462, 426, 218, 188, 122, 120, 108, 107, 102, 90, 84, 77, 67, 64, 51, 44, 42, 38, 32, 19, 16, 11] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.5149061679840088 seconds Jaccard graph constructed in 0.7289752960205078 seconds Wrote graph to binary file in 0.0992286205291748 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.930077 Louvain completed 21 runs in 1.957927942276001 seconds PhenoGraph complete in 4.319796562194824 seconds Found communities [-1, ... 25], with sizes: [204, 1611, 781, 530, 490, 430, 228, 185, 171, 120, 112, 110, 105, 104, 97, 67, 65, 49, 42, 42, 34, 34, 31, 25, 22, 15, 13] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.6140358448028564 seconds Jaccard graph constructed in 0.7665765285491943 seconds Wrote graph to binary file in 0.281707763671875 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.927381 Louvain completed 21 runs in 1.8730874061584473 seconds PhenoGraph complete in 4.557349443435669 seconds Found communities [-1, ... 24], with sizes: [223, 1607, 881, 503, 460, 425, 222, 217, 155, 132, 103, 101, 95, 93, 81, 70, 65, 53, 51, 43, 34, 32, 23, 19, 16, 13] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.614048719406128 seconds Jaccard graph constructed in 0.7313416004180908 seconds Wrote graph to binary file in 0.28035402297973633 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.927283 Louvain completed 21 runs in 1.9604082107543945 seconds PhenoGraph complete in 4.606944561004639 seconds Found communities [-1, ... 23], with sizes: [199, 1592, 967, 497, 480, 457, 226, 168, 122, 108, 107, 105, 104, 103, 99, 65, 65, 50, 42, 39, 32, 27, 23, 22, 18]
# Scale every cell to 10,000 total counts, then log1p-transform the values.
sc.pp.normalize_per_cell(D372_Biop_Pro1, counts_per_cell_after=1e4)
sc.pp.log1p(D372_Biop_Pro1)

# Keep the normalized, log-transformed matrix (all genes) in .raw before the
# gene subset below is taken.
D372_Biop_Pro1.raw = D372_Biop_Pro1

# var['ribo_genes'] holds True for NON-ribosomal genes (it is the negated
# ribosomal mask), so this subset removes the ribosomal genes.
genes_to_keep = D372_Biop_Pro1.var['ribo_genes']
D372_Biop_Pro1 = D372_Biop_Pro1[:, genes_to_keep]
D372_Biop_Pro1
View of AnnData object with n_obs × n_vars = 4574 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the Cell Ranger filtered matrix for sample D322_Biop_Int1, indexed by
# gene symbol (scanpy caches the parsed matrix, see the log line below).
D322_Biop_Int1 = sc.read_10x_mtx(
    './D322_Biop_Int1/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
D322_Biop_Int1.var_names_make_unique()

# Sample-level annotations, broadcast to every cell (insertion order is kept).
sample_annotations = {
    'manip': 'D322_Biop_Int1',
    'position': 'Intermediate',
    'method': 'Biopsy',
    'donor': 'D322',
}
for obs_key, obs_value in sample_annotations.items():
    D322_Biop_Int1.obs[obs_key] = obs_value

# Prefix each barcode with the sample name and use the result as the cell index.
D322_Biop_Int1.obs['name'] = [
    'D322_Biop_Int1_' + barcode for barcode in D322_Biop_Int1.obs.index
]
D322_Biop_Int1.obs_names = D322_Biop_Int1.obs['name']
D322_Biop_Int1
... reading from cache file ./cache/D322_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1923 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell QC metrics for D322_Biop_Int1: detected genes, total counts, and
# the fraction of counts coming from mitochondrial / ribosomal genes.
sc.pl.highest_expr_genes(D322_Biop_Int1, n_top=20)

# min_genes=0 is not meant to remove cells; the call is made here so that
# obs['n_genes'] exists for the violin plot below.
sc.pp.filter_cells(D322_Biop_Int1, min_genes=0)

# Total counts per cell, computed once and reused for every ratio below.
total_counts = D322_Biop_Int1.X.sum(axis=1).A1

# 'percent_mito' is stored as a fraction in [0, 1], not as a percentage.
mito_genes = D322_Biop_Int1.var_names.str.startswith('MT-')
D322_Biop_Int1.obs['percent_mito'] = (
    D322_Biop_Int1[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D322_Biop_Int1.obs['n_counts'] = total_counts

# remove_RB_genes needs a dense cells x genes DataFrame; build it only once.
# Only the list of ribosomal genes found in this sample is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D322_Biop_Int1.to_df()
)

# The columns of to_df() are var_names, so the membership mask can be built
# from var_names directly instead of densifying the matrix a second time.
ribo_genes = D322_Biop_Int1.var_names.isin(RB_genes_in_df)
D322_Biop_Int1.obs['percent_ribo'] = (
    D322_Biop_Int1[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)

# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT
# ribosomal; the other samples in this notebook use it as the gene keep-mask.
D322_Biop_Int1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D322_Biop_Int1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D322_Biop_Int1.
# 1) Drop cells with fewer than 500 detected genes (done in place).
sc.pp.filter_cells(D322_Biop_Int1, min_genes=500)

# 2) Keep only cells with fewer than 20,000 total counts.
within_count_limit = D322_Biop_Int1.obs['n_counts'] < 20000
D322_Biop_Int1 = D322_Biop_Int1[within_count_limit, :]

# 3) Keep only cells whose mitochondrial fraction is below 0.2.
within_mito_limit = D322_Biop_Int1.obs['percent_mito'] < 0.2
D322_Biop_Int1 = D322_Biop_Int1[within_mito_limit, :]
filtered out 64 cells that have less than 500 genes expressed
# Scrublet doublet inference on the filtered count matrix of D322_Biop_Int1.
# expected_doublet_rate is chosen per sample (0.016 for this one).
scrub = scr.Scrublet(D322_Biop_Int1.X, expected_doublet_rate=0.016)
# scrub_doublets() yields two per-cell results: a score and a doublet call.
# The score threshold is set automatically (see the log printed below).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata.
D322_Biop_Int1.obs['doublet_scores'] = doublet_scores
D322_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
# Last expression of the cell, so the returned figure is displayed.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.18 Detected doublet rate = 0.4% Estimated detectable doublet fraction = 16.4% Overall doublet rate: Expected = 1.6% Estimated = 2.6% Elapsed time: 0.9 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb5251908>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea99d1ac8>],
dtype=object))
# DoubletDetection: second doublet caller, used with the default
# BoostClassifier settings.
clf = doubletdetection.BoostClassifier()
# Fit on the same filtered count matrix as Scrublet; the sparse matrix is
# densified by the library (see the UserWarning below).
labels = clf.fit(D322_Biop_Int1.X).predict()
# Store the per-cell labels alongside the Scrublet results.
D322_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4068300724029541 seconds Jaccard graph constructed in 0.4693014621734619 seconds Wrote graph to binary file in 0.031632184982299805 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87247 After 12 runs, maximum modularity is Q = 0.873539 Louvain completed 32 runs in 1.9383304119110107 seconds PhenoGraph complete in 2.8590123653411865 seconds Found communities [-1, ... 16], with sizes: [201, 587, 261, 196, 170, 154, 117, 111, 88, 75, 68, 59, 55, 53, 49, 32, 23, 22] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40746402740478516 seconds Jaccard graph constructed in 0.44730067253112793 seconds Wrote graph to binary file in 0.2316131591796875 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868111 Louvain completed 21 runs in 1.2237942218780518 seconds PhenoGraph complete in 2.320791244506836 seconds Found communities [-1, ... 18], with sizes: [211, 752, 264, 146, 128, 122, 100, 93, 80, 76, 72, 50, 48, 45, 36, 29, 25, 19, 13, 12] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20693516731262207 seconds Jaccard graph constructed in 0.4806842803955078 seconds Wrote graph to binary file in 0.04390215873718262 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.871274 Louvain completed 21 runs in 1.2296905517578125 seconds PhenoGraph complete in 1.9819493293762207 seconds Found communities [-1, ... 15], with sizes: [239, 467, 265, 259, 197, 168, 104, 95, 94, 93, 88, 64, 50, 47, 39, 31, 21] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4074211120605469 seconds Jaccard graph constructed in 0.5327847003936768 seconds Wrote graph to binary file in 0.03722238540649414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.862273 After 6 runs, maximum modularity is Q = 0.864137 Louvain completed 26 runs in 1.7315800189971924 seconds PhenoGraph complete in 2.7248294353485107 seconds Found communities [-1, ... 18], with sizes: [164, 465, 272, 201, 184, 178, 115, 110, 106, 94, 87, 72, 66, 48, 45, 45, 20, 20, 17, 12] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40697407722473145 seconds Jaccard graph constructed in 0.5319037437438965 seconds Wrote graph to binary file in 0.03862738609313965 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869802 Louvain completed 21 runs in 1.2083230018615723 seconds PhenoGraph complete in 2.198496103286743 seconds Found communities [-1, ... 
16], with sizes: [208, 686, 284, 225, 162, 139, 109, 86, 74, 63, 56, 53, 50, 46, 32, 20, 17, 11] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20621824264526367 seconds Jaccard graph constructed in 0.5238604545593262 seconds Wrote graph to binary file in 0.22135210037231445 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869009 After 17 runs, maximum modularity is Q = 0.870246 Louvain completed 37 runs in 2.1698074340820312 seconds PhenoGraph complete in 3.132782220840454 seconds Found communities [-1, ... 15], with sizes: [211, 716, 249, 184, 175, 110, 102, 94, 88, 73, 71, 59, 57, 45, 45, 21, 21] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4072568416595459 seconds Jaccard graph constructed in 0.49279093742370605 seconds Wrote graph to binary file in 0.047395944595336914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.870167 After 11 runs, maximum modularity is Q = 0.871219 Louvain completed 31 runs in 1.9473226070404053 seconds PhenoGraph complete in 2.9103283882141113 seconds Found communities [-1, ... 17], with sizes: [161, 706, 253, 210, 200, 126, 96, 87, 77, 72, 64, 58, 51, 48, 45, 22, 19, 14, 12] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20772147178649902 seconds Jaccard graph constructed in 0.47537732124328613 seconds Wrote graph to binary file in 0.05222797393798828 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868438 Louvain completed 21 runs in 1.2413804531097412 seconds PhenoGraph complete in 1.9910881519317627 seconds Found communities [-1, ... 14], with sizes: [217, 404, 396, 257, 203, 176, 105, 96, 91, 84, 62, 56, 52, 50, 48, 24] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40869641304016113 seconds Jaccard graph constructed in 0.47931933403015137 seconds Wrote graph to binary file in 0.04055356979370117 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.873188 Louvain completed 21 runs in 1.2273988723754883 seconds PhenoGraph complete in 2.1766934394836426 seconds Found communities [-1, ... 17], with sizes: [209, 463, 281, 224, 186, 126, 107, 106, 99, 87, 77, 62, 61, 56, 48, 46, 44, 26, 13] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4078657627105713 seconds Jaccard graph constructed in 0.48507189750671387 seconds Wrote graph to binary file in 0.2467958927154541 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872083 Louvain completed 21 runs in 1.2212657928466797 seconds PhenoGraph complete in 2.375140428543091 seconds Found communities [-1, ... 
16], with sizes: [167, 492, 283, 252, 201, 167, 131, 91, 88, 81, 64, 59, 55, 54, 49, 47, 29, 11] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2082667350769043 seconds Jaccard graph constructed in 0.5225830078125 seconds Wrote graph to binary file in 0.036817073822021484 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.870846 Louvain completed 21 runs in 1.2443389892578125 seconds PhenoGraph complete in 2.023336172103882 seconds Found communities [-1, ... 18], with sizes: [176, 522, 263, 223, 168, 166, 114, 97, 93, 90, 83, 66, 57, 50, 50, 40, 23, 17, 12, 11] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40749216079711914 seconds Jaccard graph constructed in 0.4809436798095703 seconds Wrote graph to binary file in 0.05436515808105469 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872696 Louvain completed 21 runs in 1.2257423400878906 seconds PhenoGraph complete in 2.1857211589813232 seconds Found communities [-1, ... 15], with sizes: [177, 683, 258, 247, 211, 144, 92, 89, 83, 71, 50, 48, 43, 43, 41, 22, 19] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20770573616027832 seconds Jaccard graph constructed in 0.549626350402832 seconds Wrote graph to binary file in 0.03781294822692871 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.873802 After 3 runs, maximum modularity is Q = 0.875393 Louvain completed 23 runs in 1.5214858055114746 seconds PhenoGraph complete in 2.3344552516937256 seconds Found communities [-1, ... 15], with sizes: [214, 714, 266, 213, 178, 102, 97, 93, 89, 69, 61, 59, 47, 46, 38, 19, 16] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40694093704223633 seconds Jaccard graph constructed in 0.5209157466888428 seconds Wrote graph to binary file in 0.03594565391540527 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.871577 Louvain completed 21 runs in 1.2233171463012695 seconds PhenoGraph complete in 2.200096607208252 seconds Found communities [-1, ... 15], with sizes: [207, 762, 250, 182, 179, 151, 87, 85, 63, 62, 61, 53, 50, 47, 35, 24, 23] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20731258392333984 seconds Jaccard graph constructed in 0.561589241027832 seconds Wrote graph to binary file in 0.28632497787475586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87727 Louvain completed 21 runs in 1.36537504196167 seconds PhenoGraph complete in 2.4366579055786133 seconds Found communities [-1, ... 
18], with sizes: [174, 537, 258, 211, 187, 178, 110, 101, 91, 90, 80, 62, 61, 45, 44, 32, 24, 13, 12, 11] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41071152687072754 seconds Jaccard graph constructed in 0.5380644798278809 seconds Wrote graph to binary file in 0.037424564361572266 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868076 After 6 runs, maximum modularity is Q = 0.869086 Louvain completed 26 runs in 1.6808693408966064 seconds PhenoGraph complete in 2.6815407276153564 seconds Found communities [-1, ... 17], with sizes: [173, 588, 265, 197, 188, 170, 110, 109, 89, 86, 71, 65, 50, 45, 44, 26, 22, 12, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4077441692352295 seconds Jaccard graph constructed in 0.47994494438171387 seconds Wrote graph to binary file in 0.039659738540649414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.878816 Louvain completed 21 runs in 1.2495367527008057 seconds PhenoGraph complete in 2.202623128890991 seconds Found communities [-1, ... 17], with sizes: [192, 518, 232, 198, 182, 172, 124, 117, 111, 83, 78, 70, 56, 54, 46, 36, 24, 17, 11] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20657849311828613 seconds Jaccard graph constructed in 0.47501611709594727 seconds Wrote graph to binary file in 0.03810620307922363 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868923 After 5 runs, maximum modularity is Q = 0.870273 Louvain completed 25 runs in 1.6174962520599365 seconds PhenoGraph complete in 2.376770496368408 seconds Found communities [-1, ... 17], with sizes: [171, 558, 277, 203, 177, 174, 172, 93, 83, 71, 58, 51, 49, 47, 45, 45, 21, 14, 12] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40731334686279297 seconds Jaccard graph constructed in 0.47495150566101074 seconds Wrote graph to binary file in 0.039067745208740234 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868957 Louvain completed 21 runs in 1.277984857559204 seconds PhenoGraph complete in 2.2233362197875977 seconds Found communities [-1, ... 16], with sizes: [192, 713, 325, 217, 116, 114, 86, 79, 70, 68, 63, 55, 47, 47, 46, 46, 22, 15] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20666027069091797 seconds Jaccard graph constructed in 0.46600866317749023 seconds Wrote graph to binary file in 0.27489566802978516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.870521 After 2 runs, maximum modularity is Q = 0.871618 Louvain completed 22 runs in 1.5180156230926514 seconds PhenoGraph complete in 2.479268789291382 seconds Found communities [-1, ... 17], with sizes: [154, 770, 228, 198, 165, 140, 94, 86, 77, 69, 65, 59, 54, 52, 45, 23, 20, 11, 11] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4068617820739746 seconds Jaccard graph constructed in 0.4757375717163086 seconds Wrote graph to binary file in 0.05712580680847168 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872589 After 6 runs, maximum modularity is Q = 0.874023 Louvain completed 26 runs in 1.664569616317749 seconds PhenoGraph complete in 2.617093086242676 seconds Found communities [-1, ... 19], with sizes: [190, 606, 239, 235, 108, 107, 102, 96, 94, 89, 78, 75, 57, 56, 48, 46, 35, 22, 14, 12, 12] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2073688507080078 seconds Jaccard graph constructed in 0.5114006996154785 seconds Wrote graph to binary file in 0.0673227310180664 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868407 After 3 runs, maximum modularity is Q = 0.87025 Louvain completed 23 runs in 1.7132079601287842 seconds PhenoGraph complete in 2.5164921283721924 seconds Found communities [-1, ... 16], with sizes: [207, 566, 238, 203, 201, 144, 117, 109, 92, 88, 65, 62, 54, 51, 49, 43, 21, 11] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20794010162353516 seconds Jaccard graph constructed in 0.5248119831085205 seconds Wrote graph to binary file in 0.03822636604309082 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872533 Louvain completed 21 runs in 1.2348878383636475 seconds PhenoGraph complete in 2.017181396484375 seconds Found communities [-1, ... 15], with sizes: [228, 729, 339, 172, 106, 105, 99, 90, 84, 83, 63, 58, 45, 45, 43, 21, 11] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4074375629425049 seconds Jaccard graph constructed in 0.4670424461364746 seconds Wrote graph to binary file in 0.05050349235534668 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.873935 Louvain completed 21 runs in 1.2327196598052979 seconds PhenoGraph complete in 2.1718688011169434 seconds Found communities [-1, ... 
15], with sizes: [207, 730, 296, 203, 170, 135, 88, 84, 82, 67, 61, 49, 45, 39, 33, 20, 12] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30725574493408203 seconds Jaccard graph constructed in 0.46344733238220215 seconds Wrote graph to binary file in 0.06349396705627441 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.873975 Louvain completed 21 runs in 1.2412848472595215 seconds PhenoGraph complete in 2.0901646614074707 seconds Found communities [-1, ... 19], with sizes: [200, 493, 258, 213, 188, 185, 136, 97, 88, 78, 62, 61, 54, 45, 43, 43, 23, 14, 14, 14, 12]
# Post-doublet-inference processing of D322_Biop_Int1: depth-normalize,
# log-transform, freeze the full-gene state, then subset the genes.
# Each cell is rescaled to a total of 10,000 counts.
sc.pp.normalize_per_cell(D322_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D322_Biop_Int1) # log transform the data
# .raw is assigned BEFORE the gene subset below, so the frozen state is the
# normalized, log-transformed object prior to any gene removal.
D322_Biop_Int1.raw = D322_Biop_Int1 # freeze the object (for later use of the raw state of it)
# var['ribo_genes'] is used as a boolean keep-mask over genes.
# NOTE(review): presumably it was built like in the other dataset cells
# (True = gene is NOT in the RB gene list), so this drops ribosomal genes — confirm.
D322_Biop_Int1 = D322_Biop_Int1[:, D322_Biop_Int1.var['ribo_genes']]
# Bare expression: displays the object summary as the cell output.
D322_Biop_Int1
View of AnnData object with n_obs × n_vars = 1857 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the D326_Biop_Int1 10x matrix (path = dataset folder + shared outsPath),
# indexing genes by symbol; cache=True lets scanpy reuse its cached .h5ad copy.
D326_Biop_Int1 = sc.read_10x_mtx(
'./D326_Biop_Int1/' + outsPath,
var_names='gene_symbols',
cache=True)
# Gene symbols can be duplicated; make var_names unique before using them as keys.
D326_Biop_Int1.var_names_make_unique()
# Per-dataset metadata, broadcast to every cell of this dataset.
D326_Biop_Int1.obs['manip'] = 'D326_Biop_Int1'
D326_Biop_Int1.obs['position'] = 'Intermediate'
D326_Biop_Int1.obs['method'] = 'Biopsy'
D326_Biop_Int1.obs['donor'] = 'D326'
# Prefix each barcode with the dataset name and use the result as the cell index
# (presumably so cell names stay unique once the datasets are aggregated).
D326_Biop_Int1.obs['name'] = ['D326_Biop_Int1_' + s for s in list(D326_Biop_Int1.obs.index)]
D326_Biop_Int1.obs_names = D326_Biop_Int1.obs['name']
# Bare expression: displays the object summary as the cell output.
D326_Biop_Int1
... reading from cache file ./cache/D326_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1248 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Quality-control metrics for D326_Biop_Int1 (computed on the unnormalized matrix).
# Plot the 20 most highly expressed genes.
sc.pl.highest_expr_genes(D326_Biop_Int1, n_top=20)
# min_genes=0 cannot exclude any cell; the call is made for its side effect of
# adding the per-cell 'n_genes' column that the QC plot and later cells rely on.
sc.pp.filter_cells(D326_Biop_Int1, min_genes=0)
# Boolean mask over genes whose symbol starts with 'MT-'.
mito_genes = D326_Biop_Int1.var_names.str.startswith('MT-')
# Per-cell ratio of 'MT-' counts to total counts. Despite the name this is a
# fraction, not a percentage. .A1 flattens the summed matrix to a 1-D array.
D326_Biop_Int1.obs['percent_mito'] = np.sum(
D326_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D326_Biop_Int1.X, axis=1).A1
# Total counts per cell.
D326_Biop_Int1.obs['n_counts'] = D326_Biop_Int1.X.sum(axis=1).A1
# Ribosomal-gene QC for D326_Biop_Int1.
# remove_RB_genes works on a dense cells x genes DataFrame and returns the
# RB-depleted frame, the RB counts removed per cell, and the list of RB genes
# actually present in this dataset. Only the gene list is used below; the other
# two results are kept under their usual names.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D326_Biop_Int1.to_df())
# Boolean mask over genes: True where the gene is in the RB gene list.
# var_names are exactly the columns of to_df(), so the mask is built from them
# directly instead of densifying the whole matrix a second time.
ribo_genes = D326_Biop_Int1.var_names.isin(RB_genes_in_df)
# Per-cell ratio of RB-gene counts to total counts (a fraction, not a percentage).
D326_Biop_Int1.obs['percent_ribo'] = np.sum(
    D326_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D326_Biop_Int1.X, axis=1).A1
# NOTE: despite its name, var['ribo_genes'] is the KEEP mask: it is True for
# genes that are NOT ribosomal. The column name is left unchanged because the
# notebook subsets genes with `adata[:, adata.var['ribo_genes']]`.
D326_Biop_Int1.var['ribo_genes'] = ~ribo_genes
# QC overview: one violin panel per per-cell metric stored in .obs
# (genes detected, total counts, mitochondrial and ribosomal fractions).
sc.pl.violin(D326_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D326_Biop_Int1.
# Drop cells with fewer than 500 detected genes (in place).
sc.pp.filter_cells(D326_Biop_Int1, min_genes=500)
# Keep cells with fewer than 25,000 total counts.
D326_Biop_Int1 = D326_Biop_Int1[D326_Biop_Int1.obs['n_counts'] < 25000, :]
# Keep cells whose mitochondrial fraction is below 0.3 (percent_mito is a 0-1 ratio).
# NOTE(review): both boolean subsets rebind the name to the indexing result,
# which is presumably an AnnData view rather than a copy — confirm if that matters downstream.
D326_Biop_Int1 = D326_Biop_Int1[D326_Biop_Int1.obs['percent_mito'] < 0.3, :]
filtered out 11 cells that have less than 500 genes expressed
# scrublet
# Doublet inference #1: run on the filtered, not-yet-normalized matrix,
# with an expected doublet rate of 1.1%.
scrub = scr.Scrublet(D326_Biop_Int1.X, expected_doublet_rate=0.011)
# scrub_doublets() returns one score and one doublet call per cell;
# the call threshold is chosen automatically (see the printed log).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata.
D326_Biop_Int1.obs['doublet_scores'] = doublet_scores
D326_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
# Score histograms, used to eyeball the automatically chosen threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.11 Detected doublet rate = 0.7% Estimated detectable doublet fraction = 16.1% Overall doublet rate: Expected = 1.1% Estimated = 4.1% Elapsed time: 0.7 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb019b240>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb646c240>],
dtype=object))
# doubletDetection
# Doublet inference #2: BoostClassifier with its default settings, fitted on the
# same filtered, not-yet-normalized matrix (the sparse input is densified by the
# library, as its warning reports).
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D326_Biop_Int1.X).predict()
# One label per cell, stored in the per-cell metadata.
# NOTE(review): presumably 1 = doublet, 0 = singlet, possibly NaN for ambiguous cells — confirm against the doubletdetection docs.
D326_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21250677108764648 seconds Jaccard graph constructed in 0.3636820316314697 seconds Wrote graph to binary file in 0.02783966064453125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872555 After 2 runs, maximum modularity is Q = 0.876168 Louvain completed 22 runs in 1.3342251777648926 seconds PhenoGraph complete in 1.9531424045562744 seconds Found communities [-1, ... 16], with sizes: [245, 265, 183, 152, 106, 72, 70, 69, 64, 60, 55, 40, 35, 31, 29, 28, 15, 13] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21496820449829102 seconds Jaccard graph constructed in 0.37735795974731445 seconds Wrote graph to binary file in 0.026920080184936523 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.871503 Louvain completed 21 runs in 1.1651508808135986 seconds PhenoGraph complete in 1.794358730316162 seconds Found communities [-1, ... 14], with sizes: [228, 280, 179, 152, 142, 84, 83, 83, 65, 65, 42, 36, 29, 29, 23, 12] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.206953763961792 seconds Jaccard graph constructed in 0.42186832427978516 seconds Wrote graph to binary file in 0.022740840911865234 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.875249 Louvain completed 21 runs in 1.135221242904663 seconds PhenoGraph complete in 1.798593521118164 seconds Found communities [-1, ... 14], with sizes: [244, 257, 201, 170, 103, 88, 82, 66, 62, 56, 44, 43, 38, 29, 26, 23] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21445083618164062 seconds Jaccard graph constructed in 0.43778324127197266 seconds Wrote graph to binary file in 0.02095937728881836 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.870239 After 5 runs, maximum modularity is Q = 0.871775 Louvain completed 25 runs in 1.5078670978546143 seconds PhenoGraph complete in 2.190139055252075 seconds Found communities [-1, ... 16], with sizes: [207, 257, 197, 133, 121, 91, 87, 70, 56, 50, 47, 46, 39, 32, 32, 29, 26, 12] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20624947547912598 seconds Jaccard graph constructed in 0.3668174743652344 seconds Wrote graph to binary file in 0.25129103660583496 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87048 After 2 runs, maximum modularity is Q = 0.872262 Louvain completed 22 runs in 1.370293140411377 seconds PhenoGraph complete in 2.2099478244781494 seconds Found communities [-1, ... 13], with sizes: [226, 240, 234, 188, 120, 81, 76, 75, 68, 62, 62, 37, 28, 20, 15] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21539044380187988 seconds Jaccard graph constructed in 0.3713223934173584 seconds Wrote graph to binary file in 0.03494429588317871 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.875937 Louvain completed 21 runs in 1.1226935386657715 seconds PhenoGraph complete in 1.7589333057403564 seconds Found communities [-1, ... 16], with sizes: [232, 260, 166, 113, 103, 87, 82, 78, 75, 60, 59, 53, 45, 37, 27, 24, 16, 15] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21239542961120605 seconds Jaccard graph constructed in 0.4254121780395508 seconds Wrote graph to binary file in 0.025604963302612305 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.873557 Louvain completed 21 runs in 1.123821496963501 seconds PhenoGraph complete in 1.7998547554016113 seconds Found communities [-1, ... 
14], with sizes: [233, 290, 165, 124, 105, 94, 81, 81, 75, 69, 55, 41, 40, 28, 27, 24] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2119286060333252 seconds Jaccard graph constructed in 0.3564941883087158 seconds Wrote graph to binary file in 0.030817031860351562 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872046 After 2 runs, maximum modularity is Q = 0.873677 After 4 runs, maximum modularity is Q = 0.875019 Louvain completed 24 runs in 1.707763910293579 seconds PhenoGraph complete in 2.3293588161468506 seconds Found communities [-1, ... 16], with sizes: [223, 282, 188, 180, 121, 104, 76, 50, 45, 43, 37, 35, 30, 28, 28, 27, 23, 12] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21380305290222168 seconds Jaccard graph constructed in 0.3793606758117676 seconds Wrote graph to binary file in 0.04333972930908203 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.871621 After 3 runs, maximum modularity is Q = 0.8731 Louvain completed 23 runs in 1.4189717769622803 seconds PhenoGraph complete in 2.067413806915283 seconds Found communities [-1, ... 14], with sizes: [229, 359, 175, 124, 121, 83, 73, 70, 59, 56, 50, 36, 32, 28, 25, 12] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21209287643432617 seconds Jaccard graph constructed in 0.3780078887939453 seconds Wrote graph to binary file in 0.03366875648498535 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876966 Louvain completed 21 runs in 1.1465270519256592 seconds PhenoGraph complete in 1.7857649326324463 seconds Found communities [-1, ... 18], with sizes: [234, 245, 180, 142, 91, 86, 71, 69, 57, 51, 46, 41, 39, 39, 39, 27, 26, 26, 12, 11] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21363592147827148 seconds Jaccard graph constructed in 0.37618112564086914 seconds Wrote graph to binary file in 0.2508823871612549 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.875494 Louvain completed 21 runs in 1.1351969242095947 seconds PhenoGraph complete in 1.9862525463104248 seconds Found communities [-1, ... 14], with sizes: [243, 263, 190, 152, 137, 86, 82, 65, 62, 61, 57, 53, 32, 27, 11, 11] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21294927597045898 seconds Jaccard graph constructed in 0.40344810485839844 seconds Wrote graph to binary file in 0.02588486671447754 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868243 Louvain completed 21 runs in 1.1395483016967773 seconds PhenoGraph complete in 1.7931420803070068 seconds Found communities [-1, ... 
13], with sizes: [243, 264, 180, 169, 109, 101, 85, 80, 64, 64, 51, 39, 31, 27, 25] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20701980590820312 seconds Jaccard graph constructed in 0.37091708183288574 seconds Wrote graph to binary file in 0.031915903091430664 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872594 Louvain completed 21 runs in 1.142817735671997 seconds PhenoGraph complete in 1.767308235168457 seconds Found communities [-1, ... 14], with sizes: [242, 292, 171, 167, 125, 86, 81, 74, 52, 50, 42, 38, 37, 31, 30, 14] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21493244171142578 seconds Jaccard graph constructed in 0.3666503429412842 seconds Wrote graph to binary file in 0.03907918930053711 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869055 Louvain completed 21 runs in 1.132624626159668 seconds PhenoGraph complete in 1.7644095420837402 seconds Found communities [-1, ... 16], with sizes: [224, 278, 205, 134, 126, 84, 72, 60, 60, 58, 52, 43, 38, 29, 24, 23, 11, 11] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.215256929397583 seconds Jaccard graph constructed in 0.43773579597473145 seconds Wrote graph to binary file in 0.026654720306396484 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.868166 After 6 runs, maximum modularity is Q = 0.869605 Louvain completed 26 runs in 1.5238149166107178 seconds PhenoGraph complete in 2.2181923389434814 seconds Found communities [-1, ... 14], with sizes: [229, 260, 182, 145, 120, 114, 94, 84, 79, 40, 39, 39, 37, 28, 21, 21] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21446871757507324 seconds Jaccard graph constructed in 0.37337732315063477 seconds Wrote graph to binary file in 0.05069231986999512 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.872011 After 5 runs, maximum modularity is Q = 0.873111 Louvain completed 25 runs in 1.4956026077270508 seconds PhenoGraph complete in 2.1447155475616455 seconds Found communities [-1, ... 16], with sizes: [262, 237, 201, 140, 116, 92, 78, 70, 55, 43, 40, 37, 35, 29, 29, 26, 21, 21] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21381545066833496 seconds Jaccard graph constructed in 0.3762679100036621 seconds Wrote graph to binary file in 0.03141474723815918 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.871991 Louvain completed 21 runs in 1.1394329071044922 seconds PhenoGraph complete in 1.7746365070343018 seconds Found communities [-1, ... 16], with sizes: [220, 252, 223, 157, 93, 90, 82, 70, 66, 56, 55, 36, 35, 26, 25, 21, 13, 12] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21435928344726562 seconds Jaccard graph constructed in 0.36531615257263184 seconds Wrote graph to binary file in 0.24006342887878418 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869831 After 2 runs, maximum modularity is Q = 0.871462 Louvain completed 22 runs in 1.3819866180419922 seconds PhenoGraph complete in 2.2126288414001465 seconds Found communities [-1, ... 15], with sizes: [227, 268, 176, 150, 87, 85, 85, 79, 77, 71, 58, 45, 35, 27, 22, 21, 19] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21565794944763184 seconds Jaccard graph constructed in 0.4142270088195801 seconds Wrote graph to binary file in 0.06245684623718262 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87048 After 11 runs, maximum modularity is Q = 0.871603 Louvain completed 31 runs in 2.010638952255249 seconds PhenoGraph complete in 2.71702241897583 seconds Found communities [-1, ... 13], with sizes: [250, 248, 184, 164, 113, 97, 75, 74, 73, 67, 55, 38, 36, 29, 29] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.215773344039917 seconds Jaccard graph constructed in 0.3864133358001709 seconds Wrote graph to binary file in 0.04292774200439453 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.865432 Louvain completed 21 runs in 1.1393427848815918 seconds PhenoGraph complete in 1.7990338802337646 seconds Found communities [-1, ... 15], with sizes: [230, 245, 207, 174, 115, 93, 84, 82, 80, 46, 35, 29, 27, 26, 23, 23, 13] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21304655075073242 seconds Jaccard graph constructed in 0.36087560653686523 seconds Wrote graph to binary file in 0.03091287612915039 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.878844 Louvain completed 21 runs in 1.176285982131958 seconds PhenoGraph complete in 1.794126272201538 seconds Found communities [-1, ... 
13], with sizes: [247, 282, 166, 159, 158, 87, 81, 69, 66, 54, 46, 36, 28, 27, 26] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21434879302978516 seconds Jaccard graph constructed in 0.3636810779571533 seconds Wrote graph to binary file in 0.03107428550720215 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.874058 After 6 runs, maximum modularity is Q = 0.875109 Louvain completed 26 runs in 1.6364972591400146 seconds PhenoGraph complete in 2.2630796432495117 seconds Found communities [-1, ... 16], with sizes: [258, 260, 184, 96, 78, 78, 77, 74, 64, 60, 59, 58, 51, 35, 32, 28, 22, 18] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21393156051635742 seconds Jaccard graph constructed in 0.42297863960266113 seconds Wrote graph to binary file in 0.025583982467651367 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.877464 Louvain completed 21 runs in 1.1184568405151367 seconds PhenoGraph complete in 1.7980844974517822 seconds Found communities [-1, ... 16], with sizes: [185, 257, 236, 128, 120, 80, 78, 73, 73, 65, 57, 34, 33, 29, 28, 26, 18, 12] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.206801176071167 seconds Jaccard graph constructed in 0.4006519317626953 seconds Wrote graph to binary file in 0.03341984748840332 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876441 Louvain completed 21 runs in 1.320981740951538 seconds PhenoGraph complete in 1.974416732788086 seconds Found communities [-1, ... 13], with sizes: [226, 258, 236, 138, 115, 89, 77, 71, 71, 68, 56, 39, 35, 30, 23] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20917606353759766 seconds Jaccard graph constructed in 0.372455358505249 seconds Wrote graph to binary file in 0.03403520584106445 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869474 Louvain completed 21 runs in 1.1219048500061035 seconds PhenoGraph complete in 1.748931884765625 seconds Found communities [-1, ... 14], with sizes: [256, 281, 185, 154, 118, 86, 75, 73, 69, 54, 43, 34, 28, 27, 25, 24]
# Post-doublet-inference normalization for sample D326_Biop_Int1.
# Scale every cell to a total of 10,000 counts, then apply log(1 + x).
sc.pp.normalize_per_cell(D326_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D326_Biop_Int1) # log transform the data
# This assignment comes after normalization/log1p and before the gene subsetting
# below, so .raw keeps the log-normalized values for the full gene set
# (it does not hold the untouched UMI counts).
D326_Biop_Int1.raw = D326_Biop_Int1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): for the other samples in this notebook that column is the
# negated ribosomal mask (True = NOT ribosomal); presumably the same here -
# confirm against the QC cell of this sample.
D326_Biop_Int1 = D326_Biop_Int1[:, D326_Biop_Int1.var['ribo_genes']]
# Bare expression: displays the resulting AnnData summary in the notebook.
D326_Biop_Int1
View of AnnData object with n_obs × n_vars = 1226 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the Cell Ranger filtered matrix of sample D339_Biop_Int1, using gene
# symbols as variable names (cached on disk after the first read).
D339_Biop_Int1 = sc.read_10x_mtx(
    './D339_Biop_Int1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
# Gene symbols can be duplicated; make the variable names unique.
D339_Biop_Int1.var_names_make_unique()

# Sample-level annotations, broadcast to every cell of this dataset.
D339_Biop_Int1.obs['manip'] = 'D339_Biop_Int1'
D339_Biop_Int1.obs['position'] = 'Intermediate'
D339_Biop_Int1.obs['method'] = 'Biopsy'
D339_Biop_Int1.obs['donor'] = 'D339'

# Prefix each barcode with the sample name so cells remain distinguishable
# once the datasets are aggregated.
D339_Biop_Int1.obs['name'] = [
    f'D339_Biop_Int1_{barcode}' for barcode in D339_Biop_Int1.obs.index
]
# Pass plain values, not the Series itself: a Series would name the obs index
# 'name', which collides with the 'name' column and makes that label ambiguous
# for pandas operations such as groupby / sort_values / reset_index.
D339_Biop_Int1.obs_names = D339_Biop_Int1.obs['name'].values
# Bare expression: displays the AnnData summary in the notebook.
D339_Biop_Int1
... reading from cache file ./cache/D339_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 3348 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Quality-control metrics for sample D339_Biop_Int1.
# Plot the 20 most highly expressed genes as a first sanity check.
sc.pl.highest_expr_genes(D339_Biop_Int1, n_top=20)

# min_genes=0 removes no cell; the call is used to add obs['n_genes'].
sc.pp.filter_cells(D339_Biop_Int1, min_genes=0)

# Total counts per cell, computed once and reused for both fractions below
# (.A1 flattens the (n_cells, 1) matrix returned by the sparse row sum).
total_counts = D339_Biop_Int1.X.sum(axis=1).A1

# Mitochondrial fraction; despite the name it is a ratio in [0, 1], not a percentage.
mito_genes = D339_Biop_Int1.var_names.str.startswith('MT-')
D339_Biop_Int1.obs['percent_mito'] = (
    D339_Biop_Int1[:, mito_genes].X.sum(axis=1).A1 / total_counts)
D339_Biop_Int1.obs['n_counts'] = total_counts

# remove_RB_genes reads the ribosomal gene list from disk and reports which of
# those genes are present. Only RB_genes_in_df is used in this cell.
# NOTE(review): the dense to_df() conversion and the per-cell loop inside
# remove_RB_genes are the expensive part of this cell.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D339_Biop_Int1.to_df())
# to_df() columns are the var_names, so test membership on var_names directly
# instead of densifying the matrix a second time.
ribo_genes = D339_Biop_Int1.var_names.isin(RB_genes_in_df)
D339_Biop_Int1.obs['percent_ribo'] = (
    D339_Biop_Int1[:, ribo_genes].X.sum(axis=1).A1 / total_counts)
# CAUTION: despite its name, var['ribo_genes'] is the NEGATED mask
# (True = gene is NOT ribosomal); it is used later to keep non-ribosomal genes.
D339_Biop_Int1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D339_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for sample D339_Biop_Int1.
# Drop cells with fewer than 500 detected genes (in place).
sc.pp.filter_cells(D339_Biop_Int1, min_genes=500)
# Keep cells with fewer than 30,000 counts and a mitochondrial fraction below
# 0.15. Both criteria are applied with a single mask and the result is
# materialised with .copy(), so that later cells writing to .obs operate on a
# real AnnData object instead of a view of a view.
D339_Biop_Int1 = D339_Biop_Int1[
    (D339_Biop_Int1.obs['n_counts'] < 30000)
    & (D339_Biop_Int1.obs['percent_mito'] < 0.15),
    :,
].copy()
filtered out 9 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the cell-filtered matrix; normalization
# only happens in a later cell, so .X has not been normalized here.
# NOTE(review): expected_doublet_rate differs per sample in this notebook;
# presumably chosen from the number of loaded cells - confirm the rule used.
scrub = scr.Scrublet(D339_Biop_Int1.X, expected_doublet_rate=0.026)
# Returns one score and one doublet call per cell.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D339_Biop_Int1.obs['doublet_scores'] = doublet_scores
D339_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
# Last expression of the cell: draws the score histograms and displays the
# returned (figure, axes) tuple.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.32 Detected doublet rate = 0.4% Estimated detectable doublet fraction = 10.6% Overall doublet rate: Expected = 2.6% Estimated = 3.7% Elapsed time: 2.3 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea9ab0a20>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb0147860>],
dtype=object))
# doubletDetection
# Second, independent doublet inference: fit the boosting classifier on the
# same matrix used for Scrublet, then collect its per-cell labels.
clf = doubletdetection.BoostClassifier()
clf.fit(D339_Biop_Int1.X)
labels = clf.predict()
# Keep the calls next to the Scrublet results in the cell metadata.
D339_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6120181083679199 seconds Jaccard graph constructed in 0.6159374713897705 seconds Wrote graph to binary file in 0.06875061988830566 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910447 Louvain completed 21 runs in 1.5944838523864746 seconds PhenoGraph complete in 2.907468795776367 seconds Found communities [-1, ... 27], with sizes: [111, 718, 381, 364, 245, 229, 189, 186, 176, 133, 132, 128, 122, 122, 119, 117, 108, 96, 80, 71, 57, 54, 48, 46, 34, 28, 28, 22, 18] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7108383178710938 seconds Jaccard graph constructed in 0.5960428714752197 seconds Wrote graph to binary file in 0.2878448963165283 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912211 Louvain completed 21 runs in 1.5562129020690918 seconds PhenoGraph complete in 3.165850877761841 seconds Found communities [-1, ... 25], with sizes: [99, 720, 376, 366, 342, 228, 218, 177, 167, 162, 136, 129, 123, 120, 118, 116, 110, 86, 74, 50, 49, 48, 48, 36, 28, 22, 14] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8106706142425537 seconds Jaccard graph constructed in 0.7108263969421387 seconds Wrote graph to binary file in 0.08755302429199219 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909643 Louvain completed 21 runs in 1.5580415725708008 seconds PhenoGraph complete in 3.1868462562561035 seconds Found communities [-1, ... 22], with sizes: [112, 760, 412, 365, 348, 344, 265, 180, 174, 160, 158, 135, 118, 117, 112, 83, 62, 52, 51, 46, 41, 33, 20, 14] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7121298313140869 seconds Jaccard graph constructed in 0.8929822444915771 seconds Wrote graph to binary file in 0.07616424560546875 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912137 Louvain completed 21 runs in 1.5405664443969727 seconds PhenoGraph complete in 3.241511106491089 seconds Found communities [-1, ... 23], with sizes: [107, 789, 460, 405, 373, 342, 230, 171, 167, 156, 131, 124, 106, 90, 82, 77, 74, 64, 46, 46, 41, 29, 18, 18, 16] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8125925064086914 seconds Jaccard graph constructed in 0.6645140647888184 seconds Wrote graph to binary file in 0.24959516525268555 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911793 Louvain completed 21 runs in 1.548281192779541 seconds PhenoGraph complete in 3.296985149383545 seconds Found communities [-1, ... 23], with sizes: [121, 730, 382, 369, 337, 331, 213, 178, 158, 146, 140, 130, 123, 120, 119, 113, 91, 79, 63, 60, 59, 42, 30, 14, 14] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7121119499206543 seconds Jaccard graph constructed in 0.6611349582672119 seconds Wrote graph to binary file in 0.07588648796081543 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.91247 Louvain completed 21 runs in 1.5824344158172607 seconds PhenoGraph complete in 3.051260471343994 seconds Found communities [-1, ... 25], with sizes: [79, 786, 456, 362, 319, 309, 194, 186, 177, 170, 163, 149, 133, 87, 82, 79, 67, 62, 59, 54, 40, 33, 29, 29, 22, 19, 17] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8096714019775391 seconds Jaccard graph constructed in 0.6530439853668213 seconds Wrote graph to binary file in 0.28012943267822266 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913229 Louvain completed 21 runs in 1.5736398696899414 seconds PhenoGraph complete in 3.3335275650024414 seconds Found communities [-1, ... 26], with sizes: [130, 722, 365, 361, 359, 272, 240, 238, 191, 140, 140, 126, 120, 115, 105, 91, 82, 59, 47, 47, 44, 43, 31, 30, 20, 17, 15, 12] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.811260461807251 seconds Jaccard graph constructed in 0.6973435878753662 seconds Wrote graph to binary file in 0.0801093578338623 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912733 Louvain completed 21 runs in 1.5724260807037354 seconds PhenoGraph complete in 3.179330587387085 seconds Found communities [-1, ... 24], with sizes: [107, 762, 383, 376, 335, 329, 209, 179, 178, 171, 128, 127, 116, 112, 106, 84, 80, 64, 64, 62, 53, 44, 33, 27, 18, 15] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8147614002227783 seconds Jaccard graph constructed in 0.6282577514648438 seconds Wrote graph to binary file in 0.07687115669250488 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909985 After 3 runs, maximum modularity is Q = 0.911042 Louvain completed 23 runs in 1.9309871196746826 seconds PhenoGraph complete in 3.4667675495147705 seconds Found communities [-1, ... 24], with sizes: [97, 741, 376, 367, 342, 305, 243, 194, 193, 161, 140, 121, 115, 113, 108, 103, 92, 83, 66, 51, 41, 36, 35, 16, 12, 11] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8121042251586914 seconds Jaccard graph constructed in 0.6219890117645264 seconds Wrote graph to binary file in 0.26953816413879395 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910983 Louvain completed 21 runs in 1.5791409015655518 seconds PhenoGraph complete in 3.3028202056884766 seconds Found communities [-1, ... 26], with sizes: [98, 747, 363, 357, 347, 209, 208, 183, 164, 161, 148, 147, 129, 123, 115, 104, 100, 83, 65, 53, 47, 47, 45, 36, 32, 19, 18, 14] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8197469711303711 seconds Jaccard graph constructed in 0.665412425994873 seconds Wrote graph to binary file in 0.27089881896972656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910454 Louvain completed 21 runs in 1.58369779586792 seconds PhenoGraph complete in 3.355931282043457 seconds Found communities [-1, ... 23], with sizes: [124, 743, 364, 364, 347, 266, 238, 234, 203, 155, 144, 132, 118, 105, 101, 90, 85, 82, 62, 60, 44, 43, 29, 17, 12] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7104980945587158 seconds Jaccard graph constructed in 0.608994722366333 seconds Wrote graph to binary file in 0.0742805004119873 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909149 After 2 runs, maximum modularity is Q = 0.910778 Louvain completed 22 runs in 1.8385138511657715 seconds PhenoGraph complete in 3.2510221004486084 seconds Found communities [-1, ... 23], with sizes: [122, 721, 467, 394, 364, 262, 246, 219, 201, 150, 138, 133, 131, 114, 110, 82, 53, 47, 42, 40, 39, 37, 20, 17, 13] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7156820297241211 seconds Jaccard graph constructed in 0.6979537010192871 seconds Wrote graph to binary file in 0.08060169219970703 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909846 Louvain completed 21 runs in 1.7623915672302246 seconds PhenoGraph complete in 3.2784769535064697 seconds Found communities [-1, ... 22], with sizes: [118, 711, 397, 376, 362, 321, 268, 255, 173, 148, 135, 123, 107, 106, 103, 82, 77, 74, 62, 53, 43, 29, 20, 19] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7170310020446777 seconds Jaccard graph constructed in 0.8543190956115723 seconds Wrote graph to binary file in 0.07794928550720215 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907781 After 10 runs, maximum modularity is Q = 0.908969 Louvain completed 30 runs in 2.3561477661132812 seconds PhenoGraph complete in 4.0231993198394775 seconds Found communities [-1, ... 23], with sizes: [141, 726, 394, 368, 345, 323, 266, 221, 187, 143, 136, 128, 117, 110, 100, 80, 75, 67, 56, 47, 40, 33, 29, 16, 14] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8115262985229492 seconds Jaccard graph constructed in 0.6196231842041016 seconds Wrote graph to binary file in 0.27695226669311523 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909351 After 18 runs, maximum modularity is Q = 0.910505 Louvain completed 38 runs in 2.8731987476348877 seconds PhenoGraph complete in 4.599445819854736 seconds Found communities [-1, ... 23], with sizes: [123, 719, 397, 382, 367, 367, 268, 216, 159, 137, 129, 123, 117, 117, 85, 83, 78, 61, 54, 53, 41, 30, 22, 19, 15] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7128534317016602 seconds Jaccard graph constructed in 0.6222641468048096 seconds Wrote graph to binary file in 0.07622623443603516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913185 After 3 runs, maximum modularity is Q = 0.914432 Louvain completed 23 runs in 1.8979613780975342 seconds PhenoGraph complete in 3.325296640396118 seconds Found communities [-1, ... 23], with sizes: [131, 749, 377, 362, 330, 250, 239, 231, 175, 166, 153, 141, 128, 117, 111, 100, 79, 74, 56, 47, 41, 38, 34, 18, 15] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8107166290283203 seconds Jaccard graph constructed in 0.6339046955108643 seconds Wrote graph to binary file in 0.2551889419555664 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909121 Louvain completed 21 runs in 1.600229263305664 seconds PhenoGraph complete in 3.315030813217163 seconds Found communities [-1, ... 22], with sizes: [137, 790, 363, 361, 353, 341, 331, 221, 144, 141, 121, 116, 114, 113, 112, 81, 61, 60, 53, 47, 39, 31, 17, 15] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7117364406585693 seconds Jaccard graph constructed in 0.6509895324707031 seconds Wrote graph to binary file in 0.07445740699768066 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908248 After 2 runs, maximum modularity is Q = 0.90925 After 9 runs, maximum modularity is Q = 0.910292 Louvain completed 29 runs in 2.4937171936035156 seconds PhenoGraph complete in 3.946629047393799 seconds Found communities [-1, ... 24], with sizes: [146, 757, 370, 334, 307, 237, 201, 201, 173, 172, 157, 153, 148, 139, 133, 110, 80, 65, 63, 63, 42, 28, 27, 21, 18, 17] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8095879554748535 seconds Jaccard graph constructed in 0.6299993991851807 seconds Wrote graph to binary file in 0.07435059547424316 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910371 After 2 runs, maximum modularity is Q = 0.911783 Louvain completed 22 runs in 1.8568203449249268 seconds PhenoGraph complete in 3.386380910873413 seconds Found communities [-1, ... 22], with sizes: [105, 718, 447, 402, 365, 279, 224, 188, 184, 175, 174, 128, 126, 97, 87, 86, 82, 54, 49, 45, 45, 38, 32, 32] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7113001346588135 seconds Jaccard graph constructed in 0.791698694229126 seconds Wrote graph to binary file in 0.07625126838684082 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911401 Louvain completed 21 runs in 1.565434455871582 seconds PhenoGraph complete in 3.159489870071411 seconds Found communities [-1, ... 24], with sizes: [150, 697, 409, 357, 338, 216, 206, 182, 173, 150, 134, 134, 128, 126, 119, 109, 102, 85, 81, 65, 60, 43, 41, 26, 16, 15] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8119308948516846 seconds Jaccard graph constructed in 0.601323127746582 seconds Wrote graph to binary file in 0.24887347221374512 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911418 After 12 runs, maximum modularity is Q = 0.912584 Louvain completed 32 runs in 2.530770778656006 seconds PhenoGraph complete in 4.207291603088379 seconds Found communities [-1, ... 26], with sizes: [130, 747, 385, 363, 335, 237, 224, 221, 169, 161, 156, 149, 119, 109, 89, 89, 81, 62, 59, 54, 47, 42, 30, 29, 28, 19, 16, 12] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8119664192199707 seconds Jaccard graph constructed in 0.6528275012969971 seconds Wrote graph to binary file in 0.07746100425720215 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909115 After 3 runs, maximum modularity is Q = 0.910285 Louvain completed 23 runs in 1.9448442459106445 seconds PhenoGraph complete in 3.5069580078125 seconds Found communities [-1, ... 25], with sizes: [118, 722, 375, 346, 337, 319, 201, 186, 177, 168, 158, 148, 127, 116, 113, 83, 74, 66, 51, 49, 48, 46, 38, 28, 25, 25, 18] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7134723663330078 seconds Jaccard graph constructed in 0.628441572189331 seconds Wrote graph to binary file in 0.28258347511291504 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.91328 After 4 runs, maximum modularity is Q = 0.914467 Louvain completed 24 runs in 2.0038208961486816 seconds PhenoGraph complete in 3.643587350845337 seconds Found communities [-1, ... 24], with sizes: [98, 768, 368, 361, 340, 231, 204, 186, 178, 172, 162, 154, 122, 120, 107, 102, 95, 85, 69, 51, 49, 44, 37, 25, 18, 16] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7119970321655273 seconds Jaccard graph constructed in 0.6226019859313965 seconds Wrote graph to binary file in 0.07487988471984863 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911548 After 13 runs, maximum modularity is Q = 0.912586 Louvain completed 33 runs in 2.5868496894836426 seconds PhenoGraph complete in 4.013131618499756 seconds Found communities [-1, ... 23], with sizes: [112, 807, 373, 369, 326, 290, 254, 194, 191, 183, 163, 136, 133, 115, 109, 81, 59, 54, 47, 47, 38, 33, 19, 16, 13] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8108634948730469 seconds Jaccard graph constructed in 0.6436583995819092 seconds Wrote graph to binary file in 0.07364439964294434 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906352 After 5 runs, maximum modularity is Q = 0.907726 Louvain completed 25 runs in 2.038503408432007 seconds PhenoGraph complete in 3.588808298110962 seconds Found communities [-1, ... 23], with sizes: [106, 697, 480, 386, 362, 259, 245, 186, 170, 161, 145, 135, 106, 103, 94, 83, 74, 69, 66, 59, 47, 45, 35, 34, 15]
# Post-doublet-inference normalization for sample D339_Biop_Int1.
# Scale every cell to a total of 10,000 counts, then apply log(1 + x).
sc.pp.normalize_per_cell(D339_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D339_Biop_Int1) # log transform the data
# This assignment comes after normalization/log1p and before the gene subsetting
# below, so .raw keeps the log-normalized values for the full gene set
# (it does not hold the untouched UMI counts).
D339_Biop_Int1.raw = D339_Biop_Int1 # freeze the object (for later use of the raw state of it)
# var['ribo_genes'] was built as the negated ribosomal mask, so this keeps the
# NON-ribosomal genes only; the result is a view of the full object.
D339_Biop_Int1 = D339_Biop_Int1[:, D339_Biop_Int1.var['ribo_genes']]
# Bare expression: displays the resulting AnnData summary in the notebook.
D339_Biop_Int1
View of AnnData object with n_obs × n_vars = 3330 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the Cell Ranger filtered matrix of sample D344_Biop_Int1, using gene
# symbols as variable names (cached on disk after the first read).
D344_Biop_Int1 = sc.read_10x_mtx(
    './D344_Biop_Int1/' + outsPath,
    var_names='gene_symbols',
    cache=True)
# Gene symbols can be duplicated; make the variable names unique.
D344_Biop_Int1.var_names_make_unique()

# Sample-level annotations, broadcast to every cell of this dataset.
D344_Biop_Int1.obs['manip'] = 'D344_Biop_Int1'
D344_Biop_Int1.obs['position'] = 'Intermediate'
D344_Biop_Int1.obs['method'] = 'Biopsy'
D344_Biop_Int1.obs['donor'] = 'D344'

# Prefix each barcode with the sample name so cells remain distinguishable
# once the datasets are aggregated.
D344_Biop_Int1.obs['name'] = [
    f'D344_Biop_Int1_{barcode}' for barcode in D344_Biop_Int1.obs.index
]
# Pass plain values, not the Series itself: a Series would name the obs index
# 'name', which collides with the 'name' column and makes that label ambiguous
# for pandas operations such as groupby / sort_values / reset_index.
D344_Biop_Int1.obs_names = D344_Biop_Int1.obs['name'].values
# Bare expression: displays the AnnData summary in the notebook.
D344_Biop_Int1
... reading from cache file ./cache/D344_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1051 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Quality-control metrics for sample D344_Biop_Int1.
# Plot the 20 most highly expressed genes as a first sanity check.
sc.pl.highest_expr_genes(D344_Biop_Int1, n_top=20)

# min_genes=0 removes no cell; the call is used to add obs['n_genes'].
sc.pp.filter_cells(D344_Biop_Int1, min_genes=0)

# Total counts per cell, computed once and reused for both fractions below
# (.A1 flattens the (n_cells, 1) matrix returned by the sparse row sum).
total_counts = D344_Biop_Int1.X.sum(axis=1).A1

# Mitochondrial fraction; despite the name it is a ratio in [0, 1], not a percentage.
mito_genes = D344_Biop_Int1.var_names.str.startswith('MT-')
D344_Biop_Int1.obs['percent_mito'] = (
    D344_Biop_Int1[:, mito_genes].X.sum(axis=1).A1 / total_counts)
D344_Biop_Int1.obs['n_counts'] = total_counts

# remove_RB_genes reads the ribosomal gene list from disk and reports which of
# those genes are present. Only RB_genes_in_df is used in this cell.
# NOTE(review): the dense to_df() conversion and the per-cell loop inside
# remove_RB_genes are the expensive part of this cell.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D344_Biop_Int1.to_df())
# to_df() columns are the var_names, so test membership on var_names directly
# instead of densifying the matrix a second time.
ribo_genes = D344_Biop_Int1.var_names.isin(RB_genes_in_df)
D344_Biop_Int1.obs['percent_ribo'] = (
    D344_Biop_Int1[:, ribo_genes].X.sum(axis=1).A1 / total_counts)
# CAUTION: despite its name, var['ribo_genes'] is the NEGATED mask
# (True = gene is NOT ribosomal); it is used later to keep non-ribosomal genes.
D344_Biop_Int1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D344_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for sample D344_Biop_Int1.
# Drop cells with fewer than 500 detected genes (in place).
sc.pp.filter_cells(D344_Biop_Int1, min_genes=500)
# Keep cells with fewer than 10,000 counts and a mitochondrial fraction below
# 0.1. Both criteria are applied with a single mask and the result is
# materialised with .copy(), so that later cells writing to .obs operate on a
# real AnnData object instead of a view of a view.
D344_Biop_Int1 = D344_Biop_Int1[
    (D344_Biop_Int1.obs['n_counts'] < 10000)
    & (D344_Biop_Int1.obs['percent_mito'] < 0.1),
    :,
].copy()
filtered out 27 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the cell-filtered matrix; this cell runs
# before any normalization call for this sample.
# NOTE(review): expected_doublet_rate differs per sample in this notebook;
# presumably chosen from the number of loaded cells - confirm the rule used.
scrub = scr.Scrublet(D344_Biop_Int1.X, expected_doublet_rate=0.008)
# Returns one score and one doublet call per cell.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D344_Biop_Int1.obs['doublet_scores'] = doublet_scores
D344_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
# Last expression of the cell: draws the score histograms and displays the
# returned (figure, axes) tuple.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.07 Detected doublet rate = 0.4% Estimated detectable doublet fraction = 34.1% Overall doublet rate: Expected = 0.8% Estimated = 1.2% Elapsed time: 0.4 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea9f3ec88>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eaa00e128>],
dtype=object))
# doubletDetection
# Second, independent doublet inference: fit the boosting classifier on the
# same matrix used for Scrublet, then collect its per-cell labels.
clf = doubletdetection.BoostClassifier()
clf.fit(D344_Biop_Int1.X)
labels = clf.predict()
# Keep the calls next to the Scrublet results in the cell metadata.
D344_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21145963668823242 seconds Jaccard graph constructed in 0.3449239730834961 seconds Wrote graph to binary file in 0.021248817443847656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.880122 Louvain completed 21 runs in 1.1646614074707031 seconds PhenoGraph complete in 1.7518534660339355 seconds Found communities [-1, ... 14], with sizes: [185, 297, 150, 118, 87, 80, 67, 43, 39, 39, 37, 36, 33, 14, 13, 12] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1107933521270752 seconds Jaccard graph constructed in 0.3332080841064453 seconds Wrote graph to binary file in 0.01955890655517578 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87899 Louvain completed 21 runs in 1.208862066268921 seconds PhenoGraph complete in 1.6861658096313477 seconds Found communities [-1, ... 12], with sizes: [178, 258, 213, 180, 83, 79, 55, 46, 41, 41, 33, 18, 14, 11] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10710954666137695 seconds Jaccard graph constructed in 0.3355743885040283 seconds Wrote graph to binary file in 0.01905989646911621 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.880221 Louvain completed 21 runs in 1.21779203414917 seconds PhenoGraph complete in 1.6892426013946533 seconds Found communities [-1, ... 13], with sizes: [156, 262, 160, 149, 109, 60, 59, 51, 43, 40, 39, 39, 38, 33, 12] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10601449012756348 seconds Jaccard graph constructed in 0.3769807815551758 seconds Wrote graph to binary file in 0.019910573959350586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882943 Louvain completed 21 runs in 1.262915849685669 seconds PhenoGraph complete in 1.7776107788085938 seconds Found communities [-1, ... 12], with sizes: [193, 296, 197, 162, 78, 56, 54, 50, 41, 39, 34, 28, 11, 11] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10669279098510742 seconds Jaccard graph constructed in 0.3457822799682617 seconds Wrote graph to binary file in 0.0184783935546875 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.878726 Louvain completed 21 runs in 1.3269686698913574 seconds PhenoGraph complete in 1.8088159561157227 seconds Found communities [-1, ... 
15], with sizes: [185, 258, 173, 160, 80, 59, 55, 45, 41, 37, 37, 33, 26, 22, 17, 11, 11] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10851693153381348 seconds Jaccard graph constructed in 0.37672901153564453 seconds Wrote graph to binary file in 0.04572415351867676 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882706 Louvain completed 21 runs in 1.1272990703582764 seconds PhenoGraph complete in 1.6748719215393066 seconds Found communities [-1, ... 14], with sizes: [216, 282, 182, 142, 60, 57, 46, 38, 36, 36, 36, 35, 32, 26, 14, 12] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11346316337585449 seconds Jaccard graph constructed in 0.343994140625 seconds Wrote graph to binary file in 0.27814316749572754 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.877563 Louvain completed 21 runs in 1.1701586246490479 seconds PhenoGraph complete in 1.9211146831512451 seconds Found communities [-1, ... 12], with sizes: [180, 305, 207, 157, 58, 57, 46, 46, 42, 39, 33, 33, 32, 15] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1146395206451416 seconds Jaccard graph constructed in 0.3721346855163574 seconds Wrote graph to binary file in 0.03134775161743164 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.878177 Louvain completed 21 runs in 1.1684675216674805 seconds PhenoGraph complete in 1.6974670886993408 seconds Found communities [-1, ... 11], with sizes: [172, 278, 213, 187, 76, 65, 59, 50, 38, 34, 34, 33, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11188626289367676 seconds Jaccard graph constructed in 0.35836315155029297 seconds Wrote graph to binary file in 0.03089141845703125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.880741 Louvain completed 21 runs in 1.1703009605407715 seconds PhenoGraph complete in 1.704502820968628 seconds Found communities [-1, ... 13], with sizes: [227, 259, 191, 156, 59, 53, 47, 44, 40, 39, 37, 36, 33, 17, 12] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11103987693786621 seconds Jaccard graph constructed in 0.3810455799102783 seconds Wrote graph to binary file in 0.0421299934387207 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87909 Louvain completed 21 runs in 1.150974988937378 seconds PhenoGraph complete in 1.6959278583526611 seconds Found communities [-1, ... 
12], with sizes: [183, 263, 188, 170, 75, 63, 58, 50, 41, 38, 37, 36, 35, 13] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10722994804382324 seconds Jaccard graph constructed in 0.38977789878845215 seconds Wrote graph to binary file in 0.03473496437072754 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.880149 Louvain completed 21 runs in 1.3495912551879883 seconds PhenoGraph complete in 1.8937199115753174 seconds Found communities [-1, ... 12], with sizes: [215, 280, 156, 153, 85, 66, 55, 52, 50, 45, 38, 33, 11, 11] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10716843605041504 seconds Jaccard graph constructed in 0.36541080474853516 seconds Wrote graph to binary file in 0.03273296356201172 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.880606 Louvain completed 21 runs in 1.1485414505004883 seconds PhenoGraph complete in 1.663989543914795 seconds Found communities [-1, ... 12], with sizes: [172, 310, 177, 157, 78, 56, 47, 44, 40, 39, 39, 37, 32, 22] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10730195045471191 seconds Jaccard graph constructed in 0.3606255054473877 seconds Wrote graph to binary file in 0.030358552932739258 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876205 Louvain completed 21 runs in 1.1489663124084473 seconds PhenoGraph complete in 1.65879487991333 seconds Found communities [-1, ... 10], with sizes: [189, 281, 202, 164, 78, 73, 72, 45, 41, 36, 36, 33] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11252951622009277 seconds Jaccard graph constructed in 0.35149550437927246 seconds Wrote graph to binary file in 0.03131890296936035 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.88077 Louvain completed 21 runs in 1.1583356857299805 seconds PhenoGraph complete in 1.6760649681091309 seconds Found communities [-1, ... 12], with sizes: [178, 305, 158, 157, 80, 71, 53, 51, 42, 40, 38, 36, 22, 19] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10689330101013184 seconds Jaccard graph constructed in 0.3616914749145508 seconds Wrote graph to binary file in 0.03846144676208496 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.878707 Louvain completed 21 runs in 1.153308391571045 seconds PhenoGraph complete in 1.6763839721679688 seconds Found communities [-1, ... 
11], with sizes: [196, 284, 167, 138, 113, 75, 56, 52, 42, 36, 36, 32, 23] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10843348503112793 seconds Jaccard graph constructed in 0.37569642066955566 seconds Wrote graph to binary file in 0.05348014831542969 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87889 Louvain completed 21 runs in 1.142681360244751 seconds PhenoGraph complete in 1.6932039260864258 seconds Found communities [-1, ... 12], with sizes: [174, 277, 158, 157, 117, 71, 69, 48, 38, 36, 36, 34, 24, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10575532913208008 seconds Jaccard graph constructed in 0.36235928535461426 seconds Wrote graph to binary file in 0.028481245040893555 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882936 Louvain completed 21 runs in 1.1972129344940186 seconds PhenoGraph complete in 1.7038843631744385 seconds Found communities [-1, ... 13], with sizes: [196, 314, 162, 160, 56, 54, 49, 41, 40, 40, 40, 39, 33, 13, 13] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11168026924133301 seconds Jaccard graph constructed in 0.3540842533111572 seconds Wrote graph to binary file in 0.27155423164367676 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.881863 Louvain completed 21 runs in 1.1280722618103027 seconds PhenoGraph complete in 1.8748860359191895 seconds Found communities [-1, ... 13], with sizes: [208, 289, 156, 98, 78, 68, 63, 56, 46, 46, 41, 37, 33, 20, 11] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11098194122314453 seconds Jaccard graph constructed in 0.4335055351257324 seconds Wrote graph to binary file in 0.022338390350341797 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.876497 Louvain completed 21 runs in 1.1879448890686035 seconds PhenoGraph complete in 1.7635250091552734 seconds Found communities [-1, ... 12], with sizes: [187, 276, 180, 166, 80, 76, 54, 50, 42, 38, 36, 31, 18, 16] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11251401901245117 seconds Jaccard graph constructed in 0.3694918155670166 seconds Wrote graph to binary file in 0.03946948051452637 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87899 Louvain completed 21 runs in 1.168576717376709 seconds PhenoGraph complete in 1.7103748321533203 seconds Found communities [-1, ... 
12], with sizes: [199, 297, 198, 167, 86, 47, 42, 40, 36, 34, 33, 32, 23, 16] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11192011833190918 seconds Jaccard graph constructed in 0.3561713695526123 seconds Wrote graph to binary file in 0.02805948257446289 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.88075 Louvain completed 21 runs in 1.1818695068359375 seconds PhenoGraph complete in 1.6874995231628418 seconds Found communities [-1, ... 13], with sizes: [146, 297, 179, 160, 74, 69, 60, 55, 47, 45, 41, 33, 16, 15, 13] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1140294075012207 seconds Jaccard graph constructed in 0.4274132251739502 seconds Wrote graph to binary file in 0.021744966506958008 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.88156 Louvain completed 21 runs in 1.1601512432098389 seconds PhenoGraph complete in 1.7319869995117188 seconds Found communities [-1, ... 11], with sizes: [198, 283, 206, 178, 90, 50, 42, 42, 41, 35, 35, 33, 17] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11182069778442383 seconds Jaccard graph constructed in 0.35143613815307617 seconds Wrote graph to binary file in 0.03067612648010254 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.875879 Louvain completed 21 runs in 1.1906437873840332 seconds PhenoGraph complete in 1.6964304447174072 seconds Found communities [-1, ... 10], with sizes: [203, 282, 183, 174, 93, 76, 72, 41, 34, 34, 33, 25] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11396241188049316 seconds Jaccard graph constructed in 0.35881948471069336 seconds Wrote graph to binary file in 0.03446149826049805 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87648 Louvain completed 21 runs in 1.1733543872833252 seconds PhenoGraph complete in 1.6909277439117432 seconds Found communities [-1, ... 9], with sizes: [171, 308, 205, 179, 80, 76, 67, 48, 47, 35, 34] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1117711067199707 seconds Jaccard graph constructed in 0.36495161056518555 seconds Wrote graph to binary file in 0.0581965446472168 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.878107 Louvain completed 21 runs in 1.1425848007202148 seconds PhenoGraph complete in 1.690443992614746 seconds Found communities [-1, ... 12], with sizes: [192, 305, 194, 182, 71, 49, 42, 41, 38, 37, 32, 31, 23, 13]
# Normalize, log-transform, then drop ribosomal genes. The order matters: .raw is
# assigned after log1p and before the gene subsetting, so it still contains the
# ribosomal genes.
sc.pp.normalize_per_cell(D344_Biop_Int1, counts_per_cell_after=1e4) # scale every cell to 10000 total counts
sc.pp.log1p(D344_Biop_Int1) # log transform the data
D344_Biop_Int1.raw = D344_Biop_Int1 # keep the log-normalized, all-genes state in .raw for later use
# var['ribo_genes'] is True for NON-ribosomal genes, so this slice keeps every gene
# except the ribosomal ones.
D344_Biop_Int1 = D344_Biop_Int1[:, D344_Biop_Int1.var['ribo_genes']]
# Display the resulting object.
D344_Biop_Int1
View of AnnData object with n_obs × n_vars = 1000 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x count matrix of sample D353_Biop_Int2, indexing genes by symbol
# (cache=True reuses the cached .h5ad file on later runs).
D353_Biop_Int2 = sc.read_10x_mtx(
    './D353_Biop_Int2/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
D353_Biop_Int2.var_names_make_unique()

# Sample-level annotations, repeated on every cell.
for obs_key, obs_value in (
    ('manip', 'D353_Biop_Int2'),
    ('position', 'Intermediate'),
    ('method', 'Biopsy'),
    ('donor', 'D353'),
):
    D353_Biop_Int2.obs[obs_key] = obs_value

# Prefix every barcode with the sample name and use the result as the cell index.
D353_Biop_Int2.obs['name'] = [
    'D353_Biop_Int2_' + barcode for barcode in D353_Biop_Int2.obs.index
]
D353_Biop_Int2.obs_names = D353_Biop_Int2.obs['name']

# Display the loaded object.
D353_Biop_Int2
... reading from cache file ./cache/D353_Biop_Int2-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2291 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell QC metrics for D353_Biop_Int2.
sc.pl.highest_expr_genes(D353_Biop_Int2, n_top=20)
# min_genes=0 removes no cell; the call is made for the obs['n_genes'] column it adds.
sc.pp.filter_cells(D353_Biop_Int2, min_genes=0)

# Total counts per cell, computed once and reused as the denominator of both fractions.
counts_per_cell = D353_Biop_Int2.X.sum(axis=1).A1

# Mitochondrial genes are recognised by their 'MT-' symbol prefix.
# Both 'percent_*' columns hold fractions (0-1), not percentages.
mito_genes = D353_Biop_Int2.var_names.str.startswith('MT-')
D353_Biop_Int2.obs['percent_mito'] = (
    np.sum(D353_Biop_Int2[:, mito_genes].X, axis=1).A1 / counts_per_cell
)
D353_Biop_Int2.obs['n_counts'] = counts_per_cell

# Of the three values returned by remove_RB_genes(), only RB_genes_in_df (the
# ribosomal genes present in this dataset) is used below.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Biop_Int2.to_df())
# var_names are the column labels of to_df(), so test them directly instead of
# building a second dense DataFrame just to read its columns.
ribo_genes = D353_Biop_Int2.var_names.isin(RB_genes_in_df)
D353_Biop_Int2.obs['percent_ribo'] = (
    np.sum(D353_Biop_Int2[:, ribo_genes].X, axis=1).A1 / counts_per_cell
)
# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT
# ribosomal; it is meant to serve as a keep-mask when RB genes are dropped.
D353_Biop_Int2.var['ribo_genes'] = ~ribo_genes

# Per-cell distributions of the QC metrics.
sc.pl.violin(D353_Biop_Int2, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D353_Biop_Int2.
# 1) Drop cells with fewer than 500 detected genes (done in place).
sc.pp.filter_cells(D353_Biop_Int2, min_genes=500)
# 2) Keep cells with fewer than 10000 counts and a mitochondrial fraction below 0.15.
#    Both criteria are applied in one slice, and .copy() turns the resulting view
#    into a standalone AnnData so the .obs assignments that follow do not write
#    to a view.
D353_Biop_Int2 = D353_Biop_Int2[
    (D353_Biop_Int2.obs['n_counts'] < 10000) & (D353_Biop_Int2.obs['percent_mito'] < 0.15), :
].copy()
filtered out 72 cells that have less than 500 genes expressed
# Scrublet doublet inference on the filtered D353_Biop_Int2 matrix.
# expected_doublet_rate is the prior doublet fraction given to Scrublet (1.8% here).
# NOTE(review): presumably chosen from the number of cells in this sample — confirm.
scrub = scr.Scrublet(D353_Biop_Int2.X, expected_doublet_rate=0.018)
# One doublet score and one doublet call per cell; the score threshold is set
# automatically (its value is reported in the recorded output).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D353_Biop_Int2.obs['doublet_scores'] = doublet_scores
D353_Biop_Int2.obs['predicted_doublets'] = predicted_doublets
# Plot the doublet score histograms.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.21 Detected doublet rate = 0.5% Estimated detectable doublet fraction = 14.3% Overall doublet rate: Expected = 1.8% Estimated = 3.8% Elapsed time: 1.1 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea9933cf8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb95dc9e8>],
dtype=object))
# DoubletDetection doublet inference on the same filtered matrix, using the
# classifier's default settings. The sparse .X is densified by the library
# (see the UserWarning in the recorded output).
clf = doubletdetection.BoostClassifier()
# fit() runs the iterations shown in the recorded log; predict() yields one label per cell.
# NOTE(review): per the doubletdetection docs, 1 = doublet and 0 = singlet —
# confirm for the installed version.
labels = clf.fit(D353_Biop_Int2.X).predict()
# Store the labels in the cell metadata.
D353_Biop_Int2.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4091353416442871 seconds Jaccard graph constructed in 0.6043984889984131 seconds Wrote graph to binary file in 0.03512072563171387 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890691 Louvain completed 21 runs in 1.3956241607666016 seconds PhenoGraph complete in 2.4567246437072754 seconds Found communities [-1, ... 15], with sizes: [228, 1040, 693, 144, 95, 79, 78, 61, 60, 46, 44, 39, 38, 28, 26, 24, 22] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4080648422241211 seconds Jaccard graph constructed in 0.5825128555297852 seconds Wrote graph to binary file in 0.03633427619934082 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893712 Louvain completed 21 runs in 1.4647831916809082 seconds PhenoGraph complete in 2.5091614723205566 seconds Found communities [-1, ... 16], with sizes: [222, 1040, 356, 332, 134, 95, 81, 78, 76, 65, 50, 48, 35, 34, 30, 26, 23, 20] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40767621994018555 seconds Jaccard graph constructed in 0.5647685527801514 seconds Wrote graph to binary file in 0.03490257263183594 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893473 Louvain completed 21 runs in 1.440371036529541 seconds PhenoGraph complete in 2.4670298099517822 seconds Found communities [-1, ... 15], with sizes: [194, 1066, 496, 228, 139, 90, 80, 78, 78, 67, 47, 47, 35, 33, 28, 25, 14] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4076704978942871 seconds Jaccard graph constructed in 0.539881706237793 seconds Wrote graph to binary file in 0.292696475982666 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891316 After 6 runs, maximum modularity is Q = 0.892575 Louvain completed 26 runs in 2.03838849067688 seconds PhenoGraph complete in 3.292109489440918 seconds Found communities [-1, ... 15], with sizes: [278, 1029, 358, 343, 132, 86, 81, 75, 59, 58, 49, 45, 41, 31, 30, 26, 24] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.517542839050293 seconds Jaccard graph constructed in 0.5562949180603027 seconds Wrote graph to binary file in 0.03730154037475586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895413 Louvain completed 21 runs in 1.4487016201019287 seconds PhenoGraph complete in 2.579259157180786 seconds Found communities [-1, ... 
17], with sizes: [177, 1055, 568, 179, 136, 101, 81, 76, 67, 47, 41, 38, 38, 35, 34, 26, 20, 15, 11] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5140190124511719 seconds Jaccard graph constructed in 0.5885076522827148 seconds Wrote graph to binary file in 0.03614401817321777 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890801 Louvain completed 21 runs in 1.4445881843566895 seconds PhenoGraph complete in 2.5949974060058594 seconds Found communities [-1, ... 16], with sizes: [247, 1060, 357, 324, 123, 101, 99, 78, 68, 46, 43, 41, 41, 28, 25, 24, 21, 19] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5123090744018555 seconds Jaccard graph constructed in 0.555894136428833 seconds Wrote graph to binary file in 0.03653597831726074 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893846 Louvain completed 21 runs in 1.4240570068359375 seconds PhenoGraph complete in 2.542696714401245 seconds Found communities [-1, ... 15], with sizes: [240, 999, 384, 373, 116, 91, 87, 84, 73, 64, 47, 45, 41, 35, 28, 23, 15] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4090099334716797 seconds Jaccard graph constructed in 0.5889580249786377 seconds Wrote graph to binary file in 0.035860300064086914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.896264 Louvain completed 21 runs in 1.4150359630584717 seconds PhenoGraph complete in 2.462589979171753 seconds Found communities [-1, ... 16], with sizes: [190, 1051, 369, 353, 154, 78, 75, 68, 66, 63, 47, 44, 39, 38, 35, 31, 28, 16] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40950894355773926 seconds Jaccard graph constructed in 0.8809564113616943 seconds Wrote graph to binary file in 0.03651571273803711 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89772 Louvain completed 21 runs in 1.3908169269561768 seconds PhenoGraph complete in 2.7316763401031494 seconds Found communities [-1, ... 15], with sizes: [205, 1043, 367, 352, 136, 103, 86, 82, 78, 61, 47, 45, 43, 32, 29, 25, 11] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.511075496673584 seconds Jaccard graph constructed in 0.5945179462432861 seconds Wrote graph to binary file in 0.035491943359375 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.887438 Louvain completed 21 runs in 1.40632963180542 seconds PhenoGraph complete in 2.562772274017334 seconds Found communities [-1, ... 
16], with sizes: [246, 988, 499, 261, 114, 89, 87, 76, 72, 65, 48, 43, 40, 40, 25, 24, 17, 11] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5103023052215576 seconds Jaccard graph constructed in 0.5586209297180176 seconds Wrote graph to binary file in 0.03708052635192871 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893353 Louvain completed 21 runs in 1.405564308166504 seconds PhenoGraph complete in 2.523803949356079 seconds Found communities [-1, ... 15], with sizes: [215, 1133, 363, 285, 149, 85, 79, 73, 68, 67, 52, 42, 39, 31, 26, 25, 13] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4121861457824707 seconds Jaccard graph constructed in 0.5916233062744141 seconds Wrote graph to binary file in 0.036779165267944336 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.892858 Louvain completed 21 runs in 1.4294745922088623 seconds PhenoGraph complete in 2.4855849742889404 seconds Found communities [-1, ... 17], with sizes: [192, 1026, 384, 361, 114, 104, 77, 72, 71, 63, 53, 46, 43, 28, 28, 27, 24, 16, 16] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5090172290802002 seconds Jaccard graph constructed in 0.5467166900634766 seconds Wrote graph to binary file in 0.2511570453643799 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.896091 Louvain completed 21 runs in 1.407749891281128 seconds PhenoGraph complete in 2.729004383087158 seconds Found communities [-1, ... 17], with sizes: [233, 1007, 380, 349, 155, 92, 79, 78, 59, 55, 46, 41, 35, 32, 29, 26, 23, 14, 12] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41003990173339844 seconds Jaccard graph constructed in 0.5629570484161377 seconds Wrote graph to binary file in 0.05934643745422363 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89884 Louvain completed 21 runs in 1.4473161697387695 seconds PhenoGraph complete in 2.4936909675598145 seconds Found communities [-1, ... 16], with sizes: [194, 1021, 535, 230, 145, 92, 77, 71, 68, 68, 51, 43, 36, 33, 30, 28, 12, 11] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4100019931793213 seconds Jaccard graph constructed in 0.5669147968292236 seconds Wrote graph to binary file in 0.03788328170776367 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895837 Louvain completed 21 runs in 1.384540319442749 seconds PhenoGraph complete in 2.417604684829712 seconds Found communities [-1, ... 
16], with sizes: [212, 1009, 432, 337, 114, 102, 85, 77, 66, 57, 47, 42, 41, 39, 27, 26, 20, 12] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4081096649169922 seconds Jaccard graph constructed in 0.5636246204376221 seconds Wrote graph to binary file in 0.03511691093444824 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891539 Louvain completed 21 runs in 1.3966548442840576 seconds PhenoGraph complete in 2.418062448501587 seconds Found communities [-1, ... 14], with sizes: [169, 1191, 396, 205, 145, 96, 91, 89, 78, 61, 47, 47, 46, 29, 28, 27] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4078061580657959 seconds Jaccard graph constructed in 0.5556800365447998 seconds Wrote graph to binary file in 0.25050973892211914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895265 Louvain completed 21 runs in 1.4412474632263184 seconds PhenoGraph complete in 2.671192169189453 seconds Found communities [-1, ... 15], with sizes: [193, 1115, 679, 106, 88, 79, 76, 73, 72, 47, 46, 35, 32, 31, 29, 22, 22] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4085850715637207 seconds Jaccard graph constructed in 0.5557441711425781 seconds Wrote graph to binary file in 0.03228044509887695 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.889227 After 7 runs, maximum modularity is Q = 0.890463 Louvain completed 27 runs in 1.9141814708709717 seconds PhenoGraph complete in 2.9237189292907715 seconds Found communities [-1, ... 14], with sizes: [231, 1022, 373, 367, 130, 94, 92, 75, 75, 66, 52, 48, 32, 31, 30, 27] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5096926689147949 seconds Jaccard graph constructed in 0.6055824756622314 seconds Wrote graph to binary file in 0.04024457931518555 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.896855 After 2 runs, maximum modularity is Q = 0.898196 Louvain completed 22 runs in 1.6625196933746338 seconds PhenoGraph complete in 2.8347229957580566 seconds Found communities [-1, ... 15], with sizes: [226, 1080, 335, 320, 152, 97, 77, 77, 77, 60, 50, 46, 43, 41, 27, 26, 11] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4100353717803955 seconds Jaccard graph constructed in 0.5605344772338867 seconds Wrote graph to binary file in 0.034934043884277344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886007 After 17 runs, maximum modularity is Q = 0.887034 Louvain completed 37 runs in 2.4867916107177734 seconds PhenoGraph complete in 3.5050947666168213 seconds Found communities [-1, ... 16], with sizes: [205, 1021, 392, 368, 137, 126, 98, 78, 62, 46, 36, 31, 31, 25, 25, 25, 25, 14] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.509087324142456 seconds Jaccard graph constructed in 0.5716879367828369 seconds Wrote graph to binary file in 0.034432172775268555 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89655 Louvain completed 21 runs in 1.4189376831054688 seconds PhenoGraph complete in 2.5461370944976807 seconds Found communities [-1, ... 15], with sizes: [217, 1048, 361, 343, 133, 96, 86, 84, 80, 65, 48, 40, 36, 30, 29, 25, 24] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4082791805267334 seconds Jaccard graph constructed in 0.7765345573425293 seconds Wrote graph to binary file in 0.03472542762756348 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893439 Louvain completed 21 runs in 1.4554274082183838 seconds PhenoGraph complete in 2.6879711151123047 seconds Found communities [-1, ... 
14], with sizes: [216, 1238, 515, 131, 104, 83, 80, 79, 62, 48, 41, 37, 36, 25, 25, 25] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41226720809936523 seconds Jaccard graph constructed in 0.5759696960449219 seconds Wrote graph to binary file in 0.03444170951843262 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.894657 Louvain completed 21 runs in 1.3930928707122803 seconds PhenoGraph complete in 2.4279897212982178 seconds Found communities [-1, ... 15], with sizes: [179, 1038, 380, 379, 158, 87, 85, 77, 75, 57, 47, 39, 35, 32, 29, 28, 20] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4079315662384033 seconds Jaccard graph constructed in 0.5689308643341064 seconds Wrote graph to binary file in 0.034728050231933594 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.897891 Louvain completed 21 runs in 1.4391758441925049 seconds PhenoGraph complete in 2.466240882873535 seconds Found communities [-1, ... 15], with sizes: [206, 1038, 521, 197, 154, 88, 86, 83, 80, 58, 46, 45, 38, 32, 31, 28, 14] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4070625305175781 seconds Jaccard graph constructed in 0.5509645938873291 seconds Wrote graph to binary file in 0.03459048271179199 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.896833 Louvain completed 21 runs in 1.3844196796417236 seconds PhenoGraph complete in 2.3938353061676025 seconds Found communities [-1, ... 18], with sizes: [185, 1065, 366, 338, 117, 85, 77, 75, 75, 65, 48, 41, 36, 32, 27, 26, 26, 25, 20, 16]
# Post-doublet-inference processing for sample D353_Biop_Int2.
# Scale each cell to a total of 1e4 counts, then log1p-transform in place.
sc.pp.normalize_per_cell(D353_Biop_Int2, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D353_Biop_Int2) # log transform the data
# NOTE: .raw is assigned AFTER normalization and log1p, so it snapshots the
# normalized, log-transformed values (with every gene still present), not the
# original counts.
D353_Biop_Int2.raw = D353_Biop_Int2 # keep a full-gene snapshot before the gene subset below
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): despite its name, this flag presumably marks NON-ribosomal
# genes (as it does for the other samples in this file) - confirm.
D353_Biop_Int2 = D353_Biop_Int2[:, D353_Biop_Int2.var['ribo_genes']]
# Bare expression: echoes the object summary when run interactively.
D353_Biop_Int2
View of AnnData object with n_obs × n_vars = 2196 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x matrix for sample D354_Biop_Int2, indexing genes by symbol
# (cached by scanpy after the first read).
D354_Biop_Int2 = sc.read_10x_mtx(
    './D354_Biop_Int2/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
# Gene symbols can repeat; make them unique before any per-gene indexing.
D354_Biop_Int2.var_names_make_unique()

# Sample-level annotations, broadcast to every cell.
D354_Biop_Int2.obs['manip'] = 'D354_Biop_Int2'
D354_Biop_Int2.obs['position'] = 'Intermediate'
D354_Biop_Int2.obs['method'] = 'Biopsy'
D354_Biop_Int2.obs['donor'] = 'D354'

# Prefix each barcode with the sample name and use the result as the cell index.
D354_Biop_Int2.obs['name'] = [
    f'D354_Biop_Int2_{barcode}' for barcode in D354_Biop_Int2.obs.index
]
D354_Biop_Int2.obs_names = D354_Biop_Int2.obs['name']
# Bare expression: echoes the object summary when run interactively.
D354_Biop_Int2
... reading from cache file ./cache/D354_Biop_Int2-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2775 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell QC metrics for sample D354_Biop_Int2.
sc.pl.highest_expr_genes(D354_Biop_Int2, n_top=20)
# min_genes=0 removes no cell; the call is made so that obs['n_genes'] exists
# for the violin plot below.
sc.pp.filter_cells(D354_Biop_Int2, min_genes=0)

# Total counts per cell, computed once and reused as the denominator of both
# fractions below.
total_counts = D354_Biop_Int2.X.sum(axis=1).A1
D354_Biop_Int2.obs['n_counts'] = total_counts

# Fraction (0-1, not a percentage) of each cell's counts from 'MT-' genes.
mito_genes = D354_Biop_Int2.var_names.str.startswith('MT-')
D354_Biop_Int2.obs['percent_mito'] = (
    D354_Biop_Int2[:, mito_genes].X.sum(axis=1).A1 / total_counts
)

# remove_RB_genes is kept for its third return value, the list of ribosomal
# genes present in this sample; the other two outputs are not used in this
# part of the script.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D354_Biop_Int2.to_df())
# var_names are the columns of to_df(), so the mask can be built without
# densifying the whole matrix a second time.
ribo_genes = D354_Biop_Int2.var_names.isin(RB_genes_in_df)
# Fraction (0-1) of each cell's counts from ribosomal genes.
D354_Biop_Int2.obs['percent_ribo'] = (
    D354_Biop_Int2[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)
# NOTE: despite its name, var['ribo_genes'] is True for NON-ribosomal genes;
# it is used later as a keep-mask to drop ribosomal genes.
D354_Biop_Int2.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(
    D354_Biop_Int2,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for sample D354_Biop_Int2:
#   - at least 500 detected genes,
#   - fewer than 20000 total counts,
#   - mitochondrial fraction below 0.2.
sc.pp.filter_cells(D354_Biop_Int2, min_genes=500)
keep_cells = (
    (D354_Biop_Int2.obs['n_counts'] < 20000)
    & (D354_Biop_Int2.obs['percent_mito'] < 0.2)
)
# Apply both thresholds in one subset and materialize the result: the
# following steps write new columns into .obs, which should not be done
# through a view of a view.
D354_Biop_Int2 = D354_Biop_Int2[keep_cells, :].copy()
filtered out 51 cells that have less than 500 genes expressed
# scrublet: doublet scoring on the filtered matrix (run before the
# normalization / log1p step that comes later in the script).
# expected_doublet_rate is set per sample (0.022 here).
# NOTE(review): presumably derived from the number of cells loaded - confirm.
scrub = scr.Scrublet(D354_Biop_Int2.X, expected_doublet_rate=0.022)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store the per-cell score and the doublet call in the cell metadata.
D354_Biop_Int2.obs['doublet_scores'] = doublet_scores
D354_Biop_Int2.obs['predicted_doublets'] = predicted_doublets
# Plot the score histograms; as the last expression its return value
# (figure, axes) is echoed when run interactively.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.26 Detected doublet rate = 0.6% Estimated detectable doublet fraction = 14.5% Overall doublet rate: Expected = 2.2% Estimated = 4.1% Elapsed time: 1.6 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb525b3c8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9f3bc9e8>],
dtype=object))
# doubletDetection: second doublet call on the same filtered matrix, using the
# classifier's default settings.
clf = doubletdetection.BoostClassifier()
# X is passed as stored; the library warns that sparse input is densified.
labels = clf.fit(D354_Biop_Int2.X).predict()
# Store the per-cell labels in the cell metadata next to the scrublet results.
D354_Biop_Int2.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6089456081390381 seconds Jaccard graph constructed in 0.5493769645690918 seconds Wrote graph to binary file in 0.047583818435668945 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903417 After 3 runs, maximum modularity is Q = 0.904579 Louvain completed 23 runs in 1.9260752201080322 seconds PhenoGraph complete in 3.146135091781616 seconds Found communities [-1, ... 20], with sizes: [243, 1162, 328, 305, 257, 185, 178, 161, 96, 77, 74, 46, 45, 41, 35, 31, 29, 25, 25, 17, 16, 12] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6099333763122559 seconds Jaccard graph constructed in 0.5788750648498535 seconds Wrote graph to binary file in 0.04790472984313965 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906312 After 13 runs, maximum modularity is Q = 0.907492 Louvain completed 33 runs in 2.5305252075195312 seconds PhenoGraph complete in 3.7877907752990723 seconds Found communities [-1, ... 19], with sizes: [227, 1056, 364, 363, 267, 201, 161, 148, 99, 94, 73, 62, 47, 43, 36, 33, 33, 28, 26, 15, 12] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6165721416473389 seconds Jaccard graph constructed in 0.583214282989502 seconds Wrote graph to binary file in 0.2681753635406494 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902316 Louvain completed 21 runs in 1.530259370803833 seconds PhenoGraph complete in 3.0137455463409424 seconds Found communities [-1, ... 17], with sizes: [233, 1165, 348, 305, 294, 180, 173, 165, 117, 86, 72, 45, 44, 43, 31, 30, 23, 21, 13] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.61187744140625 seconds Jaccard graph constructed in 0.6052114963531494 seconds Wrote graph to binary file in 0.050127506256103516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903096 Louvain completed 21 runs in 1.5214214324951172 seconds PhenoGraph complete in 2.80751895904541 seconds Found communities [-1, ... 15], with sizes: [225, 1095, 374, 337, 309, 269, 184, 164, 137, 45, 44, 43, 42, 36, 35, 26, 23] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6149189472198486 seconds Jaccard graph constructed in 0.6141078472137451 seconds Wrote graph to binary file in 0.04991936683654785 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908502 Louvain completed 21 runs in 1.5285391807556152 seconds PhenoGraph complete in 2.822324752807617 seconds Found communities [-1, ... 
18], with sizes: [252, 1276, 532, 164, 152, 151, 140, 115, 114, 93, 88, 46, 46, 45, 37, 31, 30, 30, 25, 21] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.611790657043457 seconds Jaccard graph constructed in 0.5846893787384033 seconds Wrote graph to binary file in 0.2656435966491699 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903634 After 3 runs, maximum modularity is Q = 0.904811 Louvain completed 23 runs in 1.9008080959320068 seconds PhenoGraph complete in 3.378009796142578 seconds Found communities [-1, ... 18], with sizes: [277, 1062, 366, 325, 248, 187, 185, 157, 116, 115, 45, 44, 40, 40, 40, 36, 33, 28, 24, 20] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6172959804534912 seconds Jaccard graph constructed in 0.5735414028167725 seconds Wrote graph to binary file in 0.04891705513000488 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899241 After 2 runs, maximum modularity is Q = 0.902945 Louvain completed 22 runs in 1.845940113067627 seconds PhenoGraph complete in 3.100114107131958 seconds Found communities [-1, ... 17], with sizes: [244, 1134, 363, 332, 229, 192, 186, 170, 118, 108, 56, 48, 46, 44, 32, 28, 25, 19, 14] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6118593215942383 seconds Jaccard graph constructed in 0.5889749526977539 seconds Wrote graph to binary file in 0.046532630920410156 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903312 Louvain completed 21 runs in 1.515474557876587 seconds PhenoGraph complete in 2.7765424251556396 seconds Found communities [-1, ... 18], with sizes: [287, 1084, 353, 340, 211, 196, 172, 159, 159, 101, 77, 46, 43, 40, 30, 25, 21, 18, 15, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6111633777618408 seconds Jaccard graph constructed in 0.5823967456817627 seconds Wrote graph to binary file in 0.288393497467041 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906185 After 3 runs, maximum modularity is Q = 0.907497 Louvain completed 23 runs in 1.8971140384674072 seconds PhenoGraph complete in 3.393420457839966 seconds Found communities [-1, ... 17], with sizes: [271, 1082, 409, 299, 254, 212, 173, 156, 110, 109, 48, 45, 44, 41, 35, 34, 26, 22, 18] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.610567569732666 seconds Jaccard graph constructed in 0.6041157245635986 seconds Wrote graph to binary file in 0.04715108871459961 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90188 Louvain completed 21 runs in 1.5208909511566162 seconds PhenoGraph complete in 2.7960779666900635 seconds Found communities [-1, ... 18], with sizes: [277, 1097, 354, 340, 237, 197, 169, 116, 114, 104, 76, 50, 46, 44, 42, 32, 29, 24, 21, 19] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6107847690582275 seconds Jaccard graph constructed in 0.5874667167663574 seconds Wrote graph to binary file in 0.04792428016662598 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901651 After 4 runs, maximum modularity is Q = 0.902827 Louvain completed 24 runs in 1.9363698959350586 seconds PhenoGraph complete in 3.196136236190796 seconds Found communities [-1, ... 17], with sizes: [256, 1115, 384, 292, 289, 218, 172, 165, 163, 47, 47, 46, 45, 31, 31, 28, 28, 20, 11] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.612783670425415 seconds Jaccard graph constructed in 0.5973565578460693 seconds Wrote graph to binary file in 0.25495386123657227 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904834 After 4 runs, maximum modularity is Q = 0.906356 Louvain completed 24 runs in 1.9502596855163574 seconds PhenoGraph complete in 3.4298713207244873 seconds Found communities [-1, ... 21], with sizes: [241, 1073, 372, 370, 236, 191, 177, 148, 118, 108, 45, 43, 42, 37, 33, 27, 26, 24, 21, 19, 13, 13, 11] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.517777681350708 seconds Jaccard graph constructed in 0.5723652839660645 seconds Wrote graph to binary file in 0.04760384559631348 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899283 Louvain completed 21 runs in 1.5190637111663818 seconds PhenoGraph complete in 2.6748862266540527 seconds Found communities [-1, ... 17], with sizes: [249, 1126, 358, 324, 275, 268, 165, 158, 101, 79, 44, 43, 43, 34, 32, 27, 23, 20, 19] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6115403175354004 seconds Jaccard graph constructed in 0.5914480686187744 seconds Wrote graph to binary file in 0.046744346618652344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904794 Louvain completed 21 runs in 1.5596249103546143 seconds PhenoGraph complete in 2.8239617347717285 seconds Found communities [-1, ... 18], with sizes: [292, 1044, 376, 340, 267, 183, 151, 143, 116, 106, 78, 49, 45, 45, 34, 30, 28, 24, 21, 16] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6164212226867676 seconds Jaccard graph constructed in 0.5935096740722656 seconds Wrote graph to binary file in 0.04588150978088379 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.897071 After 2 runs, maximum modularity is Q = 0.898997 Louvain completed 22 runs in 1.897925853729248 seconds PhenoGraph complete in 3.1674208641052246 seconds Found communities [-1, ... 18], with sizes: [284, 1277, 471, 235, 208, 173, 168, 166, 62, 51, 46, 41, 41, 39, 31, 31, 21, 17, 14, 12] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6152329444885254 seconds Jaccard graph constructed in 0.8159730434417725 seconds Wrote graph to binary file in 0.04668879508972168 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901192 After 4 runs, maximum modularity is Q = 0.902879 Louvain completed 24 runs in 1.9839365482330322 seconds PhenoGraph complete in 3.481740951538086 seconds Found communities [-1, ... 17], with sizes: [231, 1120, 409, 351, 233, 189, 178, 163, 155, 75, 46, 41, 41, 40, 31, 30, 23, 19, 13] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6104776859283447 seconds Jaccard graph constructed in 0.5887281894683838 seconds Wrote graph to binary file in 0.04852747917175293 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902573 Louvain completed 21 runs in 1.5776691436767578 seconds PhenoGraph complete in 2.8476829528808594 seconds Found communities [-1, ... 18], with sizes: [266, 1102, 355, 348, 235, 165, 157, 157, 133, 102, 94, 46, 43, 37, 36, 32, 29, 22, 16, 13] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6126482486724854 seconds Jaccard graph constructed in 0.5858221054077148 seconds Wrote graph to binary file in 0.04575514793395996 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902961 Louvain completed 21 runs in 1.546755313873291 seconds PhenoGraph complete in 2.8106656074523926 seconds Found communities [-1, ... 19], with sizes: [260, 1149, 342, 331, 213, 185, 183, 161, 105, 70, 68, 56, 45, 42, 36, 30, 24, 23, 22, 22, 21] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.60927414894104 seconds Jaccard graph constructed in 0.5587124824523926 seconds Wrote graph to binary file in 0.23276758193969727 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903275 Louvain completed 21 runs in 1.5103824138641357 seconds PhenoGraph complete in 2.924783229827881 seconds Found communities [-1, ... 15], with sizes: [265, 1095, 354, 337, 274, 208, 188, 149, 117, 110, 75, 45, 43, 42, 33, 31, 22] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6164610385894775 seconds Jaccard graph constructed in 0.5717639923095703 seconds Wrote graph to binary file in 0.04487180709838867 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902751 After 3 runs, maximum modularity is Q = 0.904469 Louvain completed 23 runs in 1.8957345485687256 seconds PhenoGraph complete in 3.1415188312530518 seconds Found communities [-1, ... 17], with sizes: [228, 1115, 386, 329, 232, 230, 180, 132, 114, 105, 73, 45, 44, 37, 36, 31, 25, 23, 23] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6134498119354248 seconds Jaccard graph constructed in 0.5802252292633057 seconds Wrote graph to binary file in 0.04542064666748047 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901305 Louvain completed 21 runs in 1.515362024307251 seconds PhenoGraph complete in 2.768453598022461 seconds Found communities [-1, ... 18], with sizes: [253, 1154, 350, 331, 266, 196, 184, 149, 96, 83, 61, 46, 44, 44, 35, 30, 22, 17, 16, 11] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.615567684173584 seconds Jaccard graph constructed in 0.5754516124725342 seconds Wrote graph to binary file in 0.2666294574737549 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905599 After 12 runs, maximum modularity is Q = 0.906617 Louvain completed 32 runs in 2.4448866844177246 seconds PhenoGraph complete in 3.916701078414917 seconds Found communities [-1, ... 19], with sizes: [268, 1085, 379, 343, 205, 170, 168, 140, 124, 100, 91, 46, 46, 42, 41, 34, 28, 27, 22, 18, 11] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.612835168838501 seconds Jaccard graph constructed in 0.5812299251556396 seconds Wrote graph to binary file in 0.04573798179626465 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90242 Louvain completed 21 runs in 1.488856315612793 seconds PhenoGraph complete in 2.7458574771881104 seconds Found communities [-1, ... 17], with sizes: [235, 1072, 512, 269, 254, 202, 184, 153, 120, 116, 45, 42, 35, 29, 25, 24, 24, 24, 23] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6147117614746094 seconds Jaccard graph constructed in 0.5668911933898926 seconds Wrote graph to binary file in 0.04675579071044922 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900681 After 3 runs, maximum modularity is Q = 0.901692 Louvain completed 23 runs in 1.8528432846069336 seconds PhenoGraph complete in 3.095226764678955 seconds Found communities [-1, ... 19], with sizes: [222, 1049, 531, 262, 244, 174, 169, 150, 107, 105, 92, 45, 42, 37, 33, 28, 26, 22, 20, 16, 14] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.611067533493042 seconds Jaccard graph constructed in 0.5879242420196533 seconds Wrote graph to binary file in 0.2729678153991699 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904636 Louvain completed 21 runs in 1.540919303894043 seconds PhenoGraph complete in 3.02738618850708 seconds Found communities [-1, ... 18], with sizes: [243, 1080, 376, 357, 254, 241, 187, 135, 111, 68, 46, 46, 39, 36, 36, 33, 27, 25, 25, 23]
# Post-doublet-inference processing for sample D354_Biop_Int2.
# Scale each cell to a total of 1e4 counts, then log1p-transform in place.
sc.pp.normalize_per_cell(D354_Biop_Int2, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D354_Biop_Int2) # log transform the data
# NOTE: .raw is assigned AFTER normalization and log1p, so it snapshots the
# normalized, log-transformed values (with every gene still present), not the
# original counts.
D354_Biop_Int2.raw = D354_Biop_Int2 # keep a full-gene snapshot before the gene subset below
# Drop ribosomal genes: var['ribo_genes'] is True for NON-ribosomal genes
# (it is stored as the negation of the ribosomal-gene mask), so it serves
# directly as the keep-mask.
D354_Biop_Int2 = D354_Biop_Int2[:, D354_Biop_Int2.var['ribo_genes']]
# Bare expression: echoes the object summary when run interactively.
D354_Biop_Int2
View of AnnData object with n_obs × n_vars = 2711 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x matrix for sample D363_Biop_Int2, indexing genes by symbol
# (cached by scanpy after the first read).
D363_Biop_Int2 = sc.read_10x_mtx(
    './D363_Biop_Int2/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
# Gene symbols can repeat; make them unique before any per-gene indexing.
D363_Biop_Int2.var_names_make_unique()

# Sample-level annotations, broadcast to every cell.
D363_Biop_Int2.obs['manip'] = 'D363_Biop_Int2'
D363_Biop_Int2.obs['position'] = 'Intermediate'
D363_Biop_Int2.obs['method'] = 'Biopsy'
D363_Biop_Int2.obs['donor'] = 'D363'

# Prefix each barcode with the sample name and use the result as the cell index.
D363_Biop_Int2.obs['name'] = [
    f'D363_Biop_Int2_{barcode}' for barcode in D363_Biop_Int2.obs.index
]
D363_Biop_Int2.obs_names = D363_Biop_Int2.obs['name']
# Bare expression: echoes the object summary when run interactively.
D363_Biop_Int2
... reading from cache file ./cache/D363_Biop_Int2-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1290 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell QC metrics for sample D363_Biop_Int2.
sc.pl.highest_expr_genes(D363_Biop_Int2, n_top=20)
# min_genes=0 removes no cell; the call is made so that obs['n_genes'] exists
# for the violin plot below.
sc.pp.filter_cells(D363_Biop_Int2, min_genes=0)

# Total counts per cell, computed once and reused as the denominator of both
# fractions below.
total_counts = D363_Biop_Int2.X.sum(axis=1).A1
D363_Biop_Int2.obs['n_counts'] = total_counts

# Fraction (0-1, not a percentage) of each cell's counts from 'MT-' genes.
mito_genes = D363_Biop_Int2.var_names.str.startswith('MT-')
D363_Biop_Int2.obs['percent_mito'] = (
    D363_Biop_Int2[:, mito_genes].X.sum(axis=1).A1 / total_counts
)

# remove_RB_genes is kept for its third return value, the list of ribosomal
# genes present in this sample; the other two outputs are not used in this
# part of the script.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D363_Biop_Int2.to_df())
# var_names are the columns of to_df(), so the mask can be built without
# densifying the whole matrix a second time.
ribo_genes = D363_Biop_Int2.var_names.isin(RB_genes_in_df)
# Fraction (0-1) of each cell's counts from ribosomal genes.
D363_Biop_Int2.obs['percent_ribo'] = (
    D363_Biop_Int2[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)
# NOTE: despite its name, var['ribo_genes'] is True for NON-ribosomal genes;
# it is used later as a keep-mask to drop ribosomal genes.
D363_Biop_Int2.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(
    D363_Biop_Int2,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D363_Biop_Int2 (thresholds are sample-specific).
# 1) drop cells with fewer than 500 detected genes (in place)
sc.pp.filter_cells(D363_Biop_Int2, min_genes=500)
# 2) keep cells with fewer than 15,000 total counts
D363_Biop_Int2 = D363_Biop_Int2[D363_Biop_Int2.obs['n_counts'] < 15000, :]
# 3) keep cells whose mitochondrial fraction is below 0.2 (percent_mito is a 0-1 fraction)
D363_Biop_Int2 = D363_Biop_Int2[D363_Biop_Int2.obs['percent_mito'] < 0.2, :]
filtered out 10 cells that have less than 500 genes expressed
# scrublet
scrub = scr.Scrublet(D363_Biop_Int2.X, expected_doublet_rate=0.011)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D363_Biop_Int2.obs['doublet_scores'] = doublet_scores
D363_Biop_Int2.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.11 Detected doublet rate = 0.7% Estimated detectable doublet fraction = 23.5% Overall doublet rate: Expected = 1.1% Estimated = 3.0% Elapsed time: 0.7 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea7e4dac8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea25aa278>],
dtype=object))
# Doublet inference with DoubletDetection, using the classifier's default
# parameters (the recorded log shows 25 iterations).
clf = doubletdetection.BoostClassifier()
# .X is sparse; the library warns that it densifies it automatically.
# NOTE(review): labels presumably mark doublets vs singlets per cell (encoding
# not visible here) -- confirm against the doubletdetection documentation.
labels = clf.fit(D363_Biop_Int2.X).predict()
# Store the per-cell call in the cell metadata.
D363_Biop_Int2.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11090898513793945 seconds Jaccard graph constructed in 0.37721800804138184 seconds Wrote graph to binary file in 0.019229888916015625 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.846293 Louvain completed 21 runs in 1.2453773021697998 seconds PhenoGraph complete in 1.7667901515960693 seconds Found communities [-1, ... 15], with sizes: [224, 394, 176, 167, 101, 96, 89, 67, 54, 50, 38, 33, 28, 26, 16, 15, 13] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2131812572479248 seconds Jaccard graph constructed in 0.3695356845855713 seconds Wrote graph to binary file in 0.019904375076293945 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.851624 Louvain completed 21 runs in 1.2541816234588623 seconds PhenoGraph complete in 1.8761091232299805 seconds Found communities [-1, ... 11], with sizes: [268, 444, 285, 120, 88, 78, 76, 61, 44, 42, 31, 30, 20] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21228909492492676 seconds Jaccard graph constructed in 0.4420442581176758 seconds Wrote graph to binary file in 0.018776893615722656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.848414 Louvain completed 21 runs in 1.2108337879180908 seconds PhenoGraph complete in 1.894345760345459 seconds Found communities [-1, ... 13], with sizes: [247, 426, 351, 86, 83, 77, 72, 51, 45, 37, 36, 31, 22, 12, 11] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21122074127197266 seconds Jaccard graph constructed in 0.3942694664001465 seconds Wrote graph to binary file in 0.02224588394165039 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.845911 Louvain completed 21 runs in 1.2422690391540527 seconds PhenoGraph complete in 1.8826713562011719 seconds Found communities [-1, ... 13], with sizes: [278, 447, 246, 135, 98, 72, 58, 45, 41, 41, 38, 29, 23, 18, 18] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20770907402038574 seconds Jaccard graph constructed in 0.4313623905181885 seconds Wrote graph to binary file in 0.019024133682250977 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.851339 Louvain completed 21 runs in 1.2186212539672852 seconds PhenoGraph complete in 1.895219087600708 seconds Found communities [-1, ... 
11], with sizes: [245, 404, 235, 159, 123, 88, 78, 75, 48, 44, 35, 29, 24] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21138715744018555 seconds Jaccard graph constructed in 0.3624267578125 seconds Wrote graph to binary file in 0.018446683883666992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.846414 After 7 runs, maximum modularity is Q = 0.84756 Louvain completed 27 runs in 1.71342134475708 seconds PhenoGraph complete in 2.3191380500793457 seconds Found communities [-1, ... 14], with sizes: [206, 434, 265, 131, 99, 88, 73, 64, 50, 45, 29, 29, 27, 22, 13, 12] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2106611728668213 seconds Jaccard graph constructed in 0.4227170944213867 seconds Wrote graph to binary file in 0.26633191108703613 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.84827 After 2 runs, maximum modularity is Q = 0.849855 After 14 runs, maximum modularity is Q = 0.850876 Louvain completed 34 runs in 2.289544105529785 seconds PhenoGraph complete in 3.2031667232513428 seconds Found communities [-1, ... 13], with sizes: [234, 444, 259, 145, 83, 73, 69, 63, 48, 44, 38, 33, 21, 18, 15] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21259045600891113 seconds Jaccard graph constructed in 0.45606374740600586 seconds Wrote graph to binary file in 0.02292656898498535 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.848958 Louvain completed 21 runs in 1.2317864894866943 seconds PhenoGraph complete in 1.9321236610412598 seconds Found communities [-1, ... 11], with sizes: [229, 438, 256, 147, 110, 85, 83, 69, 44, 42, 34, 29, 21] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21307706832885742 seconds Jaccard graph constructed in 0.47411417961120605 seconds Wrote graph to binary file in 0.0263826847076416 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.847751 After 2 runs, maximum modularity is Q = 0.848798 Louvain completed 22 runs in 1.5304622650146484 seconds PhenoGraph complete in 2.2559304237365723 seconds Found communities [-1, ... 13], with sizes: [209, 408, 256, 125, 109, 86, 80, 72, 53, 47, 43, 34, 28, 24, 13] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21260690689086914 seconds Jaccard graph constructed in 0.45042896270751953 seconds Wrote graph to binary file in 0.022719144821166992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.854347 Louvain completed 21 runs in 1.2281692028045654 seconds PhenoGraph complete in 1.9236557483673096 seconds Found communities [-1, ... 
12], with sizes: [232, 422, 278, 153, 94, 84, 75, 68, 45, 43, 31, 31, 20, 11] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21191763877868652 seconds Jaccard graph constructed in 0.4443166255950928 seconds Wrote graph to binary file in 0.02356243133544922 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.84917 Louvain completed 21 runs in 1.2576515674591064 seconds PhenoGraph complete in 1.950350046157837 seconds Found communities [-1, ... 14], with sizes: [218, 436, 260, 137, 98, 77, 69, 66, 44, 40, 38, 33, 21, 21, 18, 11] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21306586265563965 seconds Jaccard graph constructed in 0.4184701442718506 seconds Wrote graph to binary file in 0.02998185157775879 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.848609 Louvain completed 21 runs in 1.3128526210784912 seconds PhenoGraph complete in 1.9944593906402588 seconds Found communities [-1, ... 13], with sizes: [217, 425, 310, 104, 97, 75, 73, 68, 45, 41, 37, 34, 26, 21, 14] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21082472801208496 seconds Jaccard graph constructed in 0.4482419490814209 seconds Wrote graph to binary file in 0.022317171096801758 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.852943 After 2 runs, maximum modularity is Q = 0.85555 After 13 runs, maximum modularity is Q = 0.856678 Louvain completed 33 runs in 2.200714349746704 seconds PhenoGraph complete in 2.8916006088256836 seconds Found communities [-1, ... 14], with sizes: [203, 330, 266, 155, 122, 91, 83, 68, 65, 51, 47, 29, 26, 23, 16, 12] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20737504959106445 seconds Jaccard graph constructed in 0.4564363956451416 seconds Wrote graph to binary file in 0.025628089904785156 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.845279 Louvain completed 21 runs in 1.4998633861541748 seconds PhenoGraph complete in 2.2094199657440186 seconds Found communities [-1, ... 13], with sizes: [193, 404, 254, 171, 85, 81, 78, 78, 56, 42, 42, 38, 32, 20, 13] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21024131774902344 seconds Jaccard graph constructed in 0.44397902488708496 seconds Wrote graph to binary file in 0.023524999618530273 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.842858 After 2 runs, maximum modularity is Q = 0.844347 Louvain completed 22 runs in 1.4769408702850342 seconds PhenoGraph complete in 2.1658098697662354 seconds Found communities [-1, ... 12], with sizes: [234, 463, 234, 147, 90, 82, 79, 78, 45, 40, 30, 23, 22, 20] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21358656883239746 seconds Jaccard graph constructed in 0.4476747512817383 seconds Wrote graph to binary file in 0.247084379196167 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.846232 Louvain completed 21 runs in 1.250878095626831 seconds PhenoGraph complete in 2.168264150619507 seconds Found communities [-1, ... 12], with sizes: [248, 420, 333, 102, 100, 78, 67, 53, 46, 40, 35, 28, 25, 12] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2079613208770752 seconds Jaccard graph constructed in 0.41254353523254395 seconds Wrote graph to binary file in 0.02986741065979004 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.84982 Louvain completed 21 runs in 1.2374930381774902 seconds PhenoGraph complete in 1.904066801071167 seconds Found communities [-1, ... 
15], with sizes: [192, 432, 268, 132, 101, 94, 85, 76, 47, 29, 25, 24, 23, 21, 14, 12, 12] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21439218521118164 seconds Jaccard graph constructed in 0.4584996700286865 seconds Wrote graph to binary file in 0.022737979888916016 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.850532 Louvain completed 21 runs in 1.2434029579162598 seconds PhenoGraph complete in 1.948211908340454 seconds Found communities [-1, ... 14], with sizes: [221, 416, 260, 124, 107, 91, 72, 57, 49, 44, 33, 28, 27, 24, 20, 14] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2076733112335205 seconds Jaccard graph constructed in 0.45208048820495605 seconds Wrote graph to binary file in 0.022835969924926758 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.85432 Louvain completed 21 runs in 1.2385921478271484 seconds PhenoGraph complete in 1.9312427043914795 seconds Found communities [-1, ... 14], with sizes: [229, 425, 269, 125, 97, 95, 74, 54, 49, 45, 38, 28, 21, 15, 12, 11] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21299219131469727 seconds Jaccard graph constructed in 0.4493439197540283 seconds Wrote graph to binary file in 0.024905681610107422 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.847756 After 16 runs, maximum modularity is Q = 0.848802 Louvain completed 36 runs in 2.1637072563171387 seconds PhenoGraph complete in 2.8618645668029785 seconds Found communities [-1, ... 14], with sizes: [252, 419, 230, 147, 94, 94, 73, 71, 46, 37, 31, 30, 23, 18, 11, 11] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21243715286254883 seconds Jaccard graph constructed in 0.4398818016052246 seconds Wrote graph to binary file in 0.022123098373413086 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.850926 Louvain completed 21 runs in 1.2408523559570312 seconds PhenoGraph complete in 1.9251196384429932 seconds Found communities [-1, ... 14], with sizes: [203, 420, 278, 121, 99, 78, 77, 63, 51, 48, 36, 33, 28, 25, 14, 13] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21517205238342285 seconds Jaccard graph constructed in 0.48202037811279297 seconds Wrote graph to binary file in 0.02966141700744629 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.845053 Louvain completed 21 runs in 1.3954365253448486 seconds PhenoGraph complete in 2.1333401203155518 seconds Found communities [-1, ... 
12], with sizes: [185, 411, 262, 164, 97, 90, 89, 60, 52, 44, 41, 40, 29, 23] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21347546577453613 seconds Jaccard graph constructed in 0.39672136306762695 seconds Wrote graph to binary file in 0.028667449951171875 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.849292 Louvain completed 21 runs in 1.2357735633850098 seconds PhenoGraph complete in 1.886756181716919 seconds Found communities [-1, ... 16], with sizes: [219, 416, 262, 152, 102, 76, 63, 62, 44, 43, 28, 26, 25, 21, 13, 12, 12, 11] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21207165718078613 seconds Jaccard graph constructed in 0.4134039878845215 seconds Wrote graph to binary file in 0.0380253791809082 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.845069 After 2 runs, maximum modularity is Q = 0.846646 Louvain completed 22 runs in 1.5019104480743408 seconds PhenoGraph complete in 2.177577257156372 seconds Found communities [-1, ... 13], with sizes: [236, 422, 240, 181, 97, 77, 72, 61, 48, 39, 28, 28, 23, 22, 13] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2114720344543457 seconds Jaccard graph constructed in 0.6479077339172363 seconds Wrote graph to binary file in 0.032082557678222656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.850514 After 5 runs, maximum modularity is Q = 0.851661 Louvain completed 25 runs in 1.6499974727630615 seconds PhenoGraph complete in 2.5567567348480225 seconds Found communities [-1, ... 12], with sizes: [210, 408, 287, 184, 100, 84, 69, 60, 41, 39, 31, 30, 24, 20]
# Final processing steps for sample D363_Biop_Int2: normalise, log-transform,
# snapshot the full gene set in .raw, then drop ribosomal genes.
sc.pp.normalize_per_cell(D363_Biop_Int2, counts_per_cell_after=1e4) # scale every cell to 10,000 total counts
sc.pp.log1p(D363_Biop_Int2) # log(1 + x) transform of the normalised values
# .raw therefore holds the normalised, log-transformed matrix with all genes
# (not the untouched counts), taken before the gene subsetting below.
D363_Biop_Int2.raw = D363_Biop_Int2 # freeze the object (for later use of this pre-subsetting state)
# var['ribo_genes'] was set to the negated ribosomal mask (True = NOT ribosomal),
# so this selection keeps the non-ribosomal genes only. The result is a view.
D363_Biop_Int2 = D363_Biop_Int2[:, D363_Biop_Int2.var['ribo_genes']]
# Bare expression: displays the object summary in the notebook.
D363_Biop_Int2
View of AnnData object with n_obs × n_vars = 1270 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x count matrix of sample D367_Biop_Int1 (genes indexed by symbol;
# cache=True lets scanpy reuse a cached copy of the matrix on later runs).
D367_Biop_Int1 = sc.read_10x_mtx(
    './D367_Biop_Int1/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
# Gene symbols can be duplicated; make them unique.
D367_Biop_Int1.var_names_make_unique()

# Sample-level metadata, repeated on every cell.
D367_Biop_Int1.obs['manip'] = 'D367_Biop_Int1'
D367_Biop_Int1.obs['position'] = 'Intermediate'
D367_Biop_Int1.obs['method'] = 'Biopsy'
D367_Biop_Int1.obs['donor'] = 'D367'

# Prefix each barcode with the sample name and use the result as the cell name.
D367_Biop_Int1.obs['name'] = [
    'D367_Biop_Int1_' + barcode for barcode in D367_Biop_Int1.obs.index
]
D367_Biop_Int1.obs_names = D367_Biop_Int1.obs['name']

# Bare expression: displays the object summary in the notebook.
D367_Biop_Int1
... reading from cache file ./cache/D367_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2310 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Quality-control metrics for D367_Biop_Int1, computed on the unfiltered,
# un-normalised matrix.
sc.pl.highest_expr_genes(D367_Biop_Int1, n_top=20)

# With min_genes=0 this call is used for its side effect of adding the
# per-cell 'n_genes' column to .obs (it appears in .obs from here on).
sc.pp.filter_cells(D367_Biop_Int1, min_genes=0)

# Fraction (0-1, not a percentage) of each cell's counts coming from
# mitochondrial genes, identified by the 'MT-' symbol prefix.
mito_genes = D367_Biop_Int1.var_names.str.startswith('MT-')
D367_Biop_Int1.obs['percent_mito'] = (
    np.sum(D367_Biop_Int1[:, mito_genes].X, axis=1).A1
    / np.sum(D367_Biop_Int1.X, axis=1).A1
)
# Total counts per cell.
D367_Biop_Int1.obs['n_counts'] = D367_Biop_Int1.X.sum(axis=1).A1

# remove_RB_genes works on a cells x genes DataFrame; only the list of
# ribosomal genes present in this sample is used below.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D367_Biop_Int1.to_df()
)
# Boolean mask over genes: True for ribosomal genes. Built from var_names
# directly (these are the columns of to_df()), which avoids densifying the
# whole matrix a second time just to read its column labels.
ribo_genes = D367_Biop_Int1.var_names.isin(RB_genes_in_df)
# Fraction (0-1) of each cell's counts coming from ribosomal genes.
D367_Biop_Int1.obs['percent_ribo'] = (
    np.sum(D367_Biop_Int1[:, ribo_genes].X, axis=1).A1
    / np.sum(D367_Biop_Int1.X, axis=1).A1
)
# NOTE: despite its name, var['ribo_genes'] is the NEGATED mask, i.e. True for
# NON-ribosomal genes. It is used later as the "genes to keep" selector.
D367_Biop_Int1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D367_Biop_Int1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for D367_Biop_Int1 (thresholds are sample-specific).
# 1) drop cells with fewer than 500 detected genes (in place)
sc.pp.filter_cells(D367_Biop_Int1, min_genes=500)
# 2) keep cells with fewer than 20,000 total counts
D367_Biop_Int1 = D367_Biop_Int1[D367_Biop_Int1.obs['n_counts'] < 20000, :]
# 3) keep cells whose mitochondrial fraction is below 0.1 (percent_mito is a 0-1 fraction)
D367_Biop_Int1 = D367_Biop_Int1[D367_Biop_Int1.obs['percent_mito'] < 0.1, :]
filtered out 12 cells that have less than 500 genes expressed
# Doublet inference with Scrublet, run on the filtered matrix before any
# normalisation (normalisation happens in a later cell).
# expected_doublet_rate is set per sample (0.018 here).
scrub = scr.Scrublet(D367_Biop_Int1.X, expected_doublet_rate=0.018)
# Returns one score per cell and a per-cell doublet call; the call uses the
# score threshold that Scrublet sets automatically (reported in its log output).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D367_Biop_Int1.obs['doublet_scores'] = doublet_scores
D367_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
# Histogram of doublet scores, to check the automatic threshold visually.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.21 Detected doublet rate = 0.6% Estimated detectable doublet fraction = 26.6% Overall doublet rate: Expected = 1.8% Estimated = 2.3% Elapsed time: 1.3 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea865b828>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea236df60>],
dtype=object))
# Doublet inference with DoubletDetection, using the classifier's default
# parameters (the recorded log shows 25 iterations).
clf = doubletdetection.BoostClassifier()
# .X is sparse; the library warns that it densifies it automatically.
# NOTE(review): labels presumably mark doublets vs singlets per cell (encoding
# not visible here) -- confirm against the doubletdetection documentation.
labels = clf.fit(D367_Biop_Int1.X).predict()
# Store the per-cell call in the cell metadata.
D367_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.514470100402832 seconds Jaccard graph constructed in 0.5651981830596924 seconds Wrote graph to binary file in 0.038552045822143555 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890353 Louvain completed 21 runs in 1.4390356540679932 seconds PhenoGraph complete in 2.5758235454559326 seconds Found communities [-1, ... 20], with sizes: [262, 591, 509, 257, 243, 166, 134, 130, 84, 68, 65, 62, 57, 50, 40, 30, 18, 18, 15, 13, 12, 11] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5079054832458496 seconds Jaccard graph constructed in 0.5689420700073242 seconds Wrote graph to binary file in 0.27757787704467773 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885757 Louvain completed 21 runs in 1.4813103675842285 seconds PhenoGraph complete in 2.8528902530670166 seconds Found communities [-1, ... 20], with sizes: [290, 586, 534, 247, 186, 131, 126, 126, 116, 73, 71, 68, 65, 50, 39, 36, 23, 17, 13, 13, 13, 12] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4097909927368164 seconds Jaccard graph constructed in 0.5726964473724365 seconds Wrote graph to binary file in 0.042246103286743164 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89165 Louvain completed 21 runs in 1.4535934925079346 seconds PhenoGraph complete in 2.489902973175049 seconds Found communities [-1, ... 20], with sizes: [352, 616, 444, 296, 172, 124, 124, 112, 96, 70, 69, 59, 58, 54, 50, 40, 23, 22, 18, 13, 12, 11] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41039419174194336 seconds Jaccard graph constructed in 0.5943446159362793 seconds Wrote graph to binary file in 0.04384565353393555 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891646 After 5 runs, maximum modularity is Q = 0.892677 Louvain completed 25 runs in 1.9274344444274902 seconds PhenoGraph complete in 2.9952476024627686 seconds Found communities [-1, ... 21], with sizes: [287, 596, 471, 282, 202, 137, 124, 110, 94, 85, 69, 63, 59, 52, 49, 39, 26, 22, 18, 15, 13, 11, 11] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41027235984802246 seconds Jaccard graph constructed in 0.5733840465545654 seconds Wrote graph to binary file in 0.042221784591674805 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.887762 Louvain completed 21 runs in 1.4715938568115234 seconds PhenoGraph complete in 2.5102217197418213 seconds Found communities [-1, ... 18], with sizes: [341, 617, 448, 282, 179, 155, 130, 126, 77, 71, 67, 62, 62, 60, 50, 39, 23, 17, 16, 13] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41303038597106934 seconds Jaccard graph constructed in 0.5697891712188721 seconds Wrote graph to binary file in 0.04419088363647461 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.887707 Louvain completed 21 runs in 1.4782352447509766 seconds PhenoGraph complete in 2.519321918487549 seconds Found communities [-1, ... 18], with sizes: [264, 602, 486, 285, 181, 147, 137, 134, 97, 72, 65, 64, 60, 57, 54, 49, 32, 22, 14, 13] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40918612480163574 seconds Jaccard graph constructed in 0.8528876304626465 seconds Wrote graph to binary file in 0.04215073585510254 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.889583 Louvain completed 21 runs in 1.443619966506958 seconds PhenoGraph complete in 2.760995626449585 seconds Found communities [-1, ... 
19], with sizes: [301, 589, 473, 326, 176, 125, 124, 117, 84, 83, 73, 63, 58, 54, 48, 39, 33, 22, 21, 13, 13] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40827417373657227 seconds Jaccard graph constructed in 0.5796041488647461 seconds Wrote graph to binary file in 0.04244089126586914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886688 Louvain completed 21 runs in 1.4799628257751465 seconds PhenoGraph complete in 2.5271430015563965 seconds Found communities [-1, ... 17], with sizes: [296, 604, 489, 365, 187, 154, 122, 119, 118, 69, 59, 59, 49, 40, 37, 27, 16, 13, 12] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40851569175720215 seconds Jaccard graph constructed in 0.5735282897949219 seconds Wrote graph to binary file in 0.04412055015563965 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89127 After 12 runs, maximum modularity is Q = 0.892354 Louvain completed 32 runs in 2.340996503829956 seconds PhenoGraph complete in 3.380171060562134 seconds Found communities [-1, ... 19], with sizes: [332, 568, 516, 288, 194, 155, 151, 125, 69, 62, 61, 57, 56, 50, 39, 27, 26, 25, 12, 11, 11] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4080784320831299 seconds Jaccard graph constructed in 0.5750257968902588 seconds Wrote graph to binary file in 0.0429537296295166 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891469 Louvain completed 21 runs in 1.4571173191070557 seconds PhenoGraph complete in 2.498727560043335 seconds Found communities [-1, ... 18], with sizes: [301, 631, 478, 297, 233, 150, 148, 101, 72, 68, 58, 56, 49, 49, 39, 31, 27, 22, 13, 12] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4076077938079834 seconds Jaccard graph constructed in 0.7601118087768555 seconds Wrote graph to binary file in 0.04154253005981445 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.883312 After 2 runs, maximum modularity is Q = 0.884909 After 7 runs, maximum modularity is Q = 0.886385 Louvain completed 27 runs in 2.2608542442321777 seconds PhenoGraph complete in 3.4828410148620605 seconds Found communities [-1, ... 17], with sizes: [292, 596, 531, 278, 214, 161, 136, 133, 127, 66, 61, 53, 53, 51, 24, 22, 13, 12, 12] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4104807376861572 seconds Jaccard graph constructed in 0.5960385799407959 seconds Wrote graph to binary file in 0.04874396324157715 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.887172 After 2 runs, maximum modularity is Q = 0.889383 Louvain completed 22 runs in 1.9729182720184326 seconds PhenoGraph complete in 3.0427701473236084 seconds Found communities [-1, ... 19], with sizes: [324, 622, 464, 323, 165, 154, 112, 110, 77, 69, 67, 57, 55, 48, 47, 39, 37, 25, 16, 13, 11] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41545677185058594 seconds Jaccard graph constructed in 0.5627744197845459 seconds Wrote graph to binary file in 0.04211997985839844 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89236 After 5 runs, maximum modularity is Q = 0.893767 Louvain completed 25 runs in 1.9204761981964111 seconds PhenoGraph complete in 2.955893039703369 seconds Found communities [-1, ... 19], with sizes: [244, 604, 480, 341, 163, 138, 136, 122, 119, 72, 64, 60, 54, 54, 47, 39, 29, 26, 16, 14, 13] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40871596336364746 seconds Jaccard graph constructed in 0.5704357624053955 seconds Wrote graph to binary file in 0.2707843780517578 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.889555 Louvain completed 21 runs in 1.4673383235931396 seconds PhenoGraph complete in 2.73282790184021 seconds Found communities [-1, ... 20], with sizes: [271, 604, 483, 322, 128, 120, 118, 110, 109, 86, 80, 77, 61, 55, 55, 39, 34, 25, 18, 16, 13, 11] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40848517417907715 seconds Jaccard graph constructed in 0.5637214183807373 seconds Wrote graph to binary file in 0.04071211814880371 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888875 Louvain completed 21 runs in 1.4741291999816895 seconds PhenoGraph complete in 2.5003631114959717 seconds Found communities [-1, ... 17], with sizes: [290, 596, 505, 318, 178, 172, 152, 129, 126, 75, 62, 61, 48, 41, 25, 18, 13, 13, 13] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4133882522583008 seconds Jaccard graph constructed in 0.5659253597259521 seconds Wrote graph to binary file in 0.04111146926879883 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891283 After 4 runs, maximum modularity is Q = 0.892318 After 9 runs, maximum modularity is Q = 0.893359 Louvain completed 29 runs in 2.357012987136841 seconds PhenoGraph complete in 3.3884224891662598 seconds Found communities [-1, ... 21], with sizes: [282, 616, 496, 263, 152, 131, 123, 121, 111, 84, 78, 66, 59, 51, 40, 39, 31, 22, 20, 13, 13, 13, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41085362434387207 seconds Jaccard graph constructed in 0.5608417987823486 seconds Wrote graph to binary file in 0.04010009765625 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886978 Louvain completed 21 runs in 1.4581246376037598 seconds PhenoGraph complete in 2.4833898544311523 seconds Found communities [-1, ... 21], with sizes: [330, 588, 503, 269, 137, 135, 129, 124, 110, 83, 79, 61, 54, 50, 40, 39, 24, 16, 15, 13, 13, 12, 11] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4110555648803711 seconds Jaccard graph constructed in 0.5750141143798828 seconds Wrote graph to binary file in 0.044264793395996094 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886068 After 2 runs, maximum modularity is Q = 0.88852 Louvain completed 22 runs in 1.7303948402404785 seconds PhenoGraph complete in 2.7754876613616943 seconds Found communities [-1, ... 19], with sizes: [281, 597, 479, 321, 177, 174, 128, 125, 77, 67, 64, 62, 61, 51, 48, 40, 23, 20, 15, 13, 12] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41065454483032227 seconds Jaccard graph constructed in 0.5838079452514648 seconds Wrote graph to binary file in 0.2743256092071533 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888173 Louvain completed 21 runs in 1.4572603702545166 seconds PhenoGraph complete in 2.741391658782959 seconds Found communities [-1, ... 17], with sizes: [309, 584, 477, 302, 212, 174, 139, 137, 123, 93, 57, 53, 47, 40, 28, 20, 14, 14, 12] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41254210472106934 seconds Jaccard graph constructed in 0.5795567035675049 seconds Wrote graph to binary file in 0.040463924407958984 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888213 Louvain completed 21 runs in 1.458282232284546 seconds PhenoGraph complete in 2.509772777557373 seconds Found communities [-1, ... 15], with sizes: [349, 621, 450, 285, 235, 166, 134, 128, 127, 67, 67, 62, 51, 47, 22, 13, 11] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4081566333770752 seconds Jaccard graph constructed in 0.5740773677825928 seconds Wrote graph to binary file in 0.039682626724243164 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893113 Louvain completed 21 runs in 1.4458818435668945 seconds PhenoGraph complete in 2.4829251766204834 seconds Found communities [-1, ... 20], with sizes: [302, 578, 463, 302, 206, 189, 134, 133, 70, 70, 67, 64, 61, 48, 38, 25, 18, 16, 14, 13, 12, 12] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4136834144592285 seconds Jaccard graph constructed in 0.5601158142089844 seconds Wrote graph to binary file in 0.041204214096069336 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888787 After 2 runs, maximum modularity is Q = 0.890064 Louvain completed 22 runs in 1.7298526763916016 seconds PhenoGraph complete in 2.7611961364746094 seconds Found communities [-1, ... 17], with sizes: [300, 607, 503, 279, 235, 173, 126, 125, 123, 62, 55, 54, 51, 50, 24, 22, 18, 15, 13] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40808629989624023 seconds Jaccard graph constructed in 0.5153343677520752 seconds Wrote graph to binary file in 0.28693652153015137 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891632 Louvain completed 21 runs in 1.4537806510925293 seconds PhenoGraph complete in 2.691406011581421 seconds Found communities [-1, ... 18], with sizes: [299, 606, 498, 333, 192, 125, 122, 113, 112, 73, 70, 59, 56, 48, 39, 25, 20, 17, 15, 13] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4140186309814453 seconds Jaccard graph constructed in 0.5657174587249756 seconds Wrote graph to binary file in 0.0407099723815918 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882774 Louvain completed 21 runs in 1.4694406986236572 seconds PhenoGraph complete in 2.505589723587036 seconds Found communities [-1, ... 17], with sizes: [234, 583, 528, 345, 172, 147, 138, 133, 123, 87, 65, 62, 49, 47, 40, 33, 25, 13, 11] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4104888439178467 seconds Jaccard graph constructed in 0.5688419342041016 seconds Wrote graph to binary file in 0.04162716865539551 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890672 Louvain completed 21 runs in 1.441237449645996 seconds PhenoGraph complete in 2.476186513900757 seconds Found communities [-1, ... 18], with sizes: [276, 669, 454, 320, 168, 136, 132, 126, 103, 92, 72, 57, 53, 46, 40, 23, 23, 18, 14, 13]
# Post-doublet-inference normalisation of D367_Biop_Int1.
# Scale every cell to a total of 10,000 counts so that cells are comparable.
sc.pp.normalize_per_cell(D367_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
# log(1 + x) transform of the normalised values.
sc.pp.log1p(D367_Biop_Int1) # log transform the data
# Snapshot the object in .raw. NOTE: this happens AFTER normalisation and
# log1p, so .raw holds the log-normalised matrix with all genes, not raw counts.
D367_Biop_Int1.raw = D367_Biop_Int1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): despite its name this flag appears to mark NON-ribosomal genes
# (for the other samples in this notebook it is set to the negation of the
# ribosomal mask) - confirm for this sample. The result of the slice is a view.
D367_Biop_Int1 = D367_Biop_Int1[:, D367_Biop_Int1.var['ribo_genes']]
# Bare expression: displays the object summary as the notebook cell output.
D367_Biop_Int1
View of AnnData object with n_obs × n_vars = 2268 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the CellRanger filtered matrix for sample D372_Biop_Int1 (cached as
# .h5ad by scanpy after the first read) and index genes by symbol.
D372_Biop_Int1 = sc.read_10x_mtx(
    './D372_Biop_Int1/' + outsPath, var_names='gene_symbols', cache=True)
# Gene symbols can be duplicated; make them unique before using them as index.
D372_Biop_Int1.var_names_make_unique()

# Sample-level metadata, broadcast to every cell.
D372_Biop_Int1.obs['manip'] = 'D372_Biop_Int1'
D372_Biop_Int1.obs['position'] = 'Intermediate'
D372_Biop_Int1.obs['method'] = 'Biopsy'
D372_Biop_Int1.obs['donor'] = 'D372'

# Prefix each barcode with the sample name so that cell names stay unique
# once the datasets are aggregated, then use those names as the cell index.
D372_Biop_Int1.obs['name'] = [
    'D372_Biop_Int1_' + barcode for barcode in D372_Biop_Int1.obs_names
]
D372_Biop_Int1.obs_names = D372_Biop_Int1.obs['name']

# Bare expression: displays the object summary as the notebook cell output.
D372_Biop_Int1
... reading from cache file ./cache/D372_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1255 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# --- Per-cell QC metrics for D372_Biop_Int1 ---------------------------------
# Quick look at the 20 genes with the highest expression in this sample.
sc.pl.highest_expr_genes(D372_Biop_Int1, n_top=20)

# min_genes=0 discards no cell; the call is made for its side effect of
# adding the per-cell 'n_genes' column to .obs (used in the violin plot below).
sc.pp.filter_cells(D372_Biop_Int1, min_genes=0)

# Fraction of each cell's counts that comes from mitochondrial ('MT-') genes.
mito_genes = D372_Biop_Int1.var_names.str.startswith('MT-')
D372_Biop_Int1.obs['percent_mito'] = np.sum(
    D372_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D372_Biop_Int1.X, axis=1).A1
# Total counts per cell.
D372_Biop_Int1.obs['n_counts'] = D372_Biop_Int1.X.sum(axis=1).A1

# remove_RB_genes works on a dense cells x genes DataFrame. Of its three
# return values only RB_genes_in_df (the ribosomal genes present in this
# dataset) is used in this section.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Biop_Int1.to_df())

# Boolean mask over genes (True = ribosomal). The columns of to_df() are
# exactly var_names, so the mask is built from var_names directly instead of
# densifying the whole matrix a second time just to read its column labels.
ribo_genes = D372_Biop_Int1.var_names.isin(RB_genes_in_df)
D372_Biop_Int1.obs['percent_ribo'] = np.sum(
    D372_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Biop_Int1.X, axis=1).A1

# CAUTION: despite its name, var['ribo_genes'] is True for genes that are NOT
# ribosomal. It is the "genes to keep" mask applied after normalisation.
D372_Biop_Int1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D372_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# --- Low-quality cell filtering for D372_Biop_Int1 --------------------------
# Drop cells with fewer than 500 detected genes (done in place).
sc.pp.filter_cells(D372_Biop_Int1, min_genes=500)
# Keep cells with fewer than 20,000 counts and less than 20% mitochondrial
# counts. Both criteria are applied in a single mask, and the result is
# materialised with .copy(): slicing an AnnData returns a view, and the
# doublet-inference steps that follow assign new .obs columns, which would
# otherwise rely on an implicit copy-on-write of that view.
D372_Biop_Int1 = D372_Biop_Int1[
    (D372_Biop_Int1.obs['n_counts'] < 20000)
    & (D372_Biop_Int1.obs['percent_mito'] < 0.2), :].copy()
filtered out 8 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the filtered matrix of D372_Biop_Int1
# (this runs before the normalisation / log1p step of this sample).
# expected_doublet_rate=0.011 is the prior doublet fraction given to Scrublet
# NOTE(review): presumably derived from the cell loading of this run - confirm.
scrub = scr.Scrublet(D372_Biop_Int1.X, expected_doublet_rate=0.011)
# One score and one doublet call per cell; the score threshold is chosen
# automatically by Scrublet (see the "Automatically set threshold" log line).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata.
D372_Biop_Int1.obs['doublet_scores'] = doublet_scores
D372_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
# Diagnostic figure of the doublet scores (returned as a figure with 2 axes).
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.11 Detected doublet rate = 0.5% Estimated detectable doublet fraction = 26.9% Overall doublet rate: Expected = 1.1% Estimated = 1.8% Elapsed time: 0.6 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea1fab1d0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea2940550>],
dtype=object))
# doubletDetection
# Second, independent doublet inference with DoubletDetection, using the
# classifier's default parameters (the log below shows 25 iterations of
# synthetic-doublet creation followed by Phenograph clustering).
clf = doubletdetection.BoostClassifier()
# The sparse matrix is densified by the package (see the UserWarning below).
# NOTE(review): labels are presumably 1 = doublet, 0 = singlet, possibly NaN
# for ambiguous cells - confirm against the installed DoubletDetection version.
labels = clf.fit(D372_Biop_Int1.X).predict()
# Store the per-cell call in the metadata.
D372_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11031770706176758 seconds Jaccard graph constructed in 0.4518752098083496 seconds Wrote graph to binary file in 0.02142500877380371 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.865176 After 14 runs, maximum modularity is Q = 0.866203 Louvain completed 34 runs in 2.141688346862793 seconds PhenoGraph complete in 2.7349908351898193 seconds Found communities [-1, ... 17], with sizes: [263, 385, 219, 84, 79, 65, 57, 53, 52, 48, 48, 37, 35, 34, 30, 26, 13, 11, 11] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21302247047424316 seconds Jaccard graph constructed in 0.4364047050476074 seconds Wrote graph to binary file in 0.30630922317504883 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869238 Louvain completed 21 runs in 1.304117202758789 seconds PhenoGraph complete in 2.2735512256622314 seconds Found communities [-1, ... 15], with sizes: [244, 377, 302, 87, 72, 69, 60, 59, 49, 46, 44, 39, 29, 25, 18, 17, 13] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2141258716583252 seconds Jaccard graph constructed in 0.457120418548584 seconds Wrote graph to binary file in 0.026486635208129883 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.858513 After 2 runs, maximum modularity is Q = 0.860467 After 3 runs, maximum modularity is Q = 0.861709 Louvain completed 23 runs in 1.8500421047210693 seconds PhenoGraph complete in 2.556292772293091 seconds Found communities [-1, ... 17], with sizes: [240, 359, 201, 139, 72, 72, 61, 61, 51, 51, 47, 37, 36, 34, 34, 15, 15, 13, 12] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2118527889251709 seconds Jaccard graph constructed in 0.4980463981628418 seconds Wrote graph to binary file in 0.03323674201965332 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.865995 Louvain completed 21 runs in 1.4247725009918213 seconds PhenoGraph complete in 2.182901382446289 seconds Found communities [-1, ... 13], with sizes: [194, 449, 240, 177, 77, 62, 50, 49, 48, 48, 41, 39, 32, 32, 12] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21134185791015625 seconds Jaccard graph constructed in 0.49370455741882324 seconds Wrote graph to binary file in 0.028309106826782227 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.863751 Louvain completed 21 runs in 1.3193938732147217 seconds PhenoGraph complete in 2.0624914169311523 seconds Found communities [-1, ... 12], with sizes: [210, 414, 321, 116, 94, 79, 58, 54, 50, 43, 34, 32, 28, 17] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21464848518371582 seconds Jaccard graph constructed in 0.41809606552124023 seconds Wrote graph to binary file in 0.03922891616821289 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.869476 After 5 runs, maximum modularity is Q = 0.871083 Louvain completed 25 runs in 1.7431824207305908 seconds PhenoGraph complete in 2.4311368465423584 seconds Found communities [-1, ... 15], with sizes: [242, 369, 229, 116, 84, 66, 65, 61, 56, 47, 44, 41, 38, 31, 27, 22, 12] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21194982528686523 seconds Jaccard graph constructed in 0.47103381156921387 seconds Wrote graph to binary file in 0.027423858642578125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.864778 After 2 runs, maximum modularity is Q = 0.86664 Louvain completed 22 runs in 1.5547842979431152 seconds PhenoGraph complete in 2.2745959758758545 seconds Found communities [-1, ... 14], with sizes: [216, 419, 188, 125, 115, 89, 79, 70, 44, 43, 35, 33, 32, 29, 21, 12] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2141737937927246 seconds Jaccard graph constructed in 0.40814638137817383 seconds Wrote graph to binary file in 0.04988431930541992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.855954 After 2 runs, maximum modularity is Q = 0.85784 After 3 runs, maximum modularity is Q = 0.858869 After 18 runs, maximum modularity is Q = 0.859884 Louvain completed 38 runs in 2.779226779937744 seconds PhenoGraph complete in 3.462613821029663 seconds Found communities [-1, ... 12], with sizes: [262, 342, 223, 162, 104, 83, 76, 76, 70, 39, 39, 35, 26, 13] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21236109733581543 seconds Jaccard graph constructed in 0.460660457611084 seconds Wrote graph to binary file in 0.026329755783081055 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.864972 Louvain completed 21 runs in 1.2901618480682373 seconds PhenoGraph complete in 1.9989025592803955 seconds Found communities [-1, ... 14], with sizes: [243, 447, 347, 72, 69, 59, 48, 45, 43, 39, 34, 32, 29, 15, 14, 14] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2124948501586914 seconds Jaccard graph constructed in 0.4669172763824463 seconds Wrote graph to binary file in 0.026116132736206055 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.87476 Louvain completed 21 runs in 1.2778146266937256 seconds PhenoGraph complete in 1.9951555728912354 seconds Found communities [-1, ... 15], with sizes: [223, 357, 323, 107, 91, 64, 57, 45, 42, 41, 41, 40, 30, 29, 26, 22, 12] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21051359176635742 seconds Jaccard graph constructed in 0.4531991481781006 seconds Wrote graph to binary file in 0.2542264461517334 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.866208 Louvain completed 21 runs in 1.2684473991394043 seconds PhenoGraph complete in 2.1980559825897217 seconds Found communities [-1, ... 
17], with sizes: [188, 458, 218, 88, 80, 63, 53, 53, 47, 43, 42, 40, 36, 29, 29, 26, 24, 18, 15] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2104489803314209 seconds Jaccard graph constructed in 0.4701554775238037 seconds Wrote graph to binary file in 0.025290489196777344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.8742 After 2 runs, maximum modularity is Q = 0.875348 Louvain completed 22 runs in 1.5572395324707031 seconds PhenoGraph complete in 2.274151563644409 seconds Found communities [-1, ... 18], with sizes: [228, 329, 212, 129, 78, 75, 66, 62, 54, 47, 47, 41, 39, 34, 31, 26, 18, 12, 11, 11] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2123584747314453 seconds Jaccard graph constructed in 0.4525775909423828 seconds Wrote graph to binary file in 0.024283647537231445 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.875277 Louvain completed 21 runs in 1.306823492050171 seconds PhenoGraph complete in 2.004891872406006 seconds Found communities [-1, ... 15], with sizes: [261, 365, 208, 99, 87, 73, 65, 51, 49, 46, 45, 42, 42, 37, 35, 28, 17] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21095514297485352 seconds Jaccard graph constructed in 0.45581841468811035 seconds Wrote graph to binary file in 0.024862051010131836 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.866302 Louvain completed 21 runs in 1.2976984977722168 seconds PhenoGraph complete in 2.0001683235168457 seconds Found communities [-1, ... 16], with sizes: [244, 356, 217, 141, 101, 79, 68, 59, 48, 46, 41, 41, 35, 20, 18, 13, 12, 11] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2138512134552002 seconds Jaccard graph constructed in 0.4093482494354248 seconds Wrote graph to binary file in 0.052251577377319336 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.862088 Louvain completed 21 runs in 1.2953226566314697 seconds PhenoGraph complete in 1.9828617572784424 seconds Found communities [-1, ... 17], with sizes: [222, 409, 203, 169, 77, 73, 61, 54, 50, 46, 34, 31, 30, 20, 17, 17, 13, 12, 12] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2129678726196289 seconds Jaccard graph constructed in 0.46299242973327637 seconds Wrote graph to binary file in 0.023497343063354492 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.861761 Louvain completed 21 runs in 1.2902758121490479 seconds PhenoGraph complete in 1.9996886253356934 seconds Found communities [-1, ... 
16], with sizes: [226, 346, 202, 164, 108, 84, 63, 62, 47, 43, 42, 41, 33, 28, 18, 18, 13, 12] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21132779121398926 seconds Jaccard graph constructed in 0.4584939479827881 seconds Wrote graph to binary file in 0.024595975875854492 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.861196 After 5 runs, maximum modularity is Q = 0.86278 Louvain completed 25 runs in 1.7095210552215576 seconds PhenoGraph complete in 2.4141924381256104 seconds Found communities [-1, ... 15], with sizes: [221, 407, 211, 105, 102, 82, 76, 67, 51, 44, 41, 41, 34, 26, 19, 12, 11] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21327757835388184 seconds Jaccard graph constructed in 0.4623739719390869 seconds Wrote graph to binary file in 0.02399420738220215 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.861037 Louvain completed 21 runs in 1.2983441352844238 seconds PhenoGraph complete in 2.0079009532928467 seconds Found communities [-1, ... 15], with sizes: [243, 325, 229, 117, 87, 87, 83, 60, 58, 50, 41, 38, 37, 36, 22, 22, 15] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21152997016906738 seconds Jaccard graph constructed in 0.46665406227111816 seconds Wrote graph to binary file in 0.02480936050415039 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.863013 Louvain completed 21 runs in 1.2821424007415771 seconds PhenoGraph complete in 1.995488166809082 seconds Found communities [-1, ... 14], with sizes: [257, 349, 198, 111, 103, 94, 91, 81, 73, 36, 34, 32, 32, 32, 15, 12] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21063899993896484 seconds Jaccard graph constructed in 0.45795249938964844 seconds Wrote graph to binary file in 0.02392888069152832 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.860467 Louvain completed 21 runs in 1.2803103923797607 seconds PhenoGraph complete in 1.9826774597167969 seconds Found communities [-1, ... 16], with sizes: [229, 344, 224, 130, 93, 88, 87, 57, 49, 41, 38, 37, 35, 29, 29, 14, 13, 13] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21447324752807617 seconds Jaccard graph constructed in 0.6688611507415771 seconds Wrote graph to binary file in 0.03195953369140625 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.864132 After 2 runs, maximum modularity is Q = 0.865931 Louvain completed 22 runs in 1.5954487323760986 seconds PhenoGraph complete in 2.5220508575439453 seconds Found communities [-1, ... 
14], with sizes: [231, 392, 183, 96, 93, 88, 88, 78, 77, 67, 41, 38, 29, 23, 14, 12] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21268606185913086 seconds Jaccard graph constructed in 0.4194824695587158 seconds Wrote graph to binary file in 0.029398441314697266 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.858512 After 2 runs, maximum modularity is Q = 0.859882 Louvain completed 22 runs in 1.5828559398651123 seconds PhenoGraph complete in 2.2718589305877686 seconds Found communities [-1, ... 15], with sizes: [262, 332, 213, 137, 120, 92, 66, 58, 50, 50, 37, 33, 32, 28, 14, 13, 13] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2121415138244629 seconds Jaccard graph constructed in 0.4728724956512451 seconds Wrote graph to binary file in 0.025281906127929688 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.85999 After 2 runs, maximum modularity is Q = 0.861145 Louvain completed 22 runs in 1.7052440643310547 seconds PhenoGraph complete in 2.427703619003296 seconds Found communities [-1, ... 15], with sizes: [240, 341, 211, 129, 128, 81, 78, 56, 53, 45, 43, 38, 33, 28, 19, 16, 11] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2129504680633545 seconds Jaccard graph constructed in 0.4776742458343506 seconds Wrote graph to binary file in 0.023955106735229492 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.863159 Louvain completed 21 runs in 1.314345359802246 seconds PhenoGraph complete in 2.0375468730926514 seconds Found communities [-1, ... 15], with sizes: [251, 350, 228, 115, 113, 72, 65, 63, 46, 45, 42, 41, 40, 33, 17, 15, 14] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2104799747467041 seconds Jaccard graph constructed in 0.4557795524597168 seconds Wrote graph to binary file in 0.023629426956176758 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.862927 Louvain completed 21 runs in 1.3198494911193848 seconds PhenoGraph complete in 2.019496202468872 seconds Found communities [-1, ... 14], with sizes: [267, 446, 210, 98, 83, 80, 62, 56, 48, 44, 42, 34, 31, 22, 14, 13]
# --- D372_Biop_Int1: normalization and gene subsetting ---
# Scale every cell to 10,000 counts, then log-transform the matrix in place.
sc.pp.normalize_per_cell(D372_Biop_Int1, counts_per_cell_after=1e4)
sc.pp.log1p(D372_Biop_Int1)

# Keep the normalized, log-transformed object with all genes under .raw
# before any gene column is dropped below.
D372_Biop_Int1.raw = D372_Biop_Int1

# NOTE(review): var['ribo_genes'] is presumably set as for the other datasets
# in this file, i.e. True for genes that are NOT in the ribosomal gene list,
# so this selection drops the ribosomal genes -- confirm against the QC cell.
non_ribo_mask = D372_Biop_Int1.var['ribo_genes']
D372_Biop_Int1 = D372_Biop_Int1[:, non_ribo_mask]
D372_Biop_Int1
View of AnnData object with n_obs × n_vars = 1240 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# --- D372_Biop_Int2: load the 10x matrix and attach sample metadata ---
D372_Biop_Int2 = sc.read_10x_mtx(
    './D372_Biop_Int2/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
D372_Biop_Int2.var_names_make_unique()

# Constant per-sample annotations, written in this order so the obs columns
# keep the same layout as the other datasets.
for obs_key, obs_value in (
    ('manip', 'D372_Biop_Int2'),
    ('position', 'Intermediate'),
    ('method', 'Biopsy'),
    ('donor', 'D372'),
):
    D372_Biop_Int2.obs[obs_key] = obs_value

# Prefix each observation label with the sample name and use the result as
# the observation index.
D372_Biop_Int2.obs['name'] = [
    'D372_Biop_Int2_' + barcode for barcode in D372_Biop_Int2.obs.index
]
D372_Biop_Int2.obs_names = D372_Biop_Int2.obs['name']
D372_Biop_Int2
... reading from cache file ./cache/D372_Biop_Int2-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 4003 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# --- D372_Biop_Int2: per-cell QC metrics ---
sc.pl.highest_expr_genes(D372_Biop_Int2, n_top=20)

# No cell can fail min_genes=0; the call is made so that obs['n_genes'] is
# available for the violin plot below.
sc.pp.filter_cells(D372_Biop_Int2, min_genes=0)

# Total counts per cell, computed once and reused for both fractions and for
# obs['n_counts'].
total_counts = D372_Biop_Int2.X.sum(axis=1).A1

# Fraction of each cell's counts that comes from 'MT-' prefixed genes.
mito_genes = D372_Biop_Int2.var_names.str.startswith('MT-')
D372_Biop_Int2.obs['percent_mito'] = (
    D372_Biop_Int2[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D372_Biop_Int2.obs['n_counts'] = total_counts

# remove_RB_genes needs a dense DataFrame; only the list of ribosomal genes
# found in the data (third return value) is used in this section.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D372_Biop_Int2.to_df()
)

# The DataFrame columns are the var_names, so test membership on var_names
# directly instead of building a second dense DataFrame just for its labels.
ribo_genes = D372_Biop_Int2.var_names.isin(RB_genes_in_df)
D372_Biop_Int2.obs['percent_ribo'] = (
    D372_Biop_Int2[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)

# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT
# ribosomal; it is used later as a keep-mask when subsetting genes.
D372_Biop_Int2.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D372_Biop_Int2,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# --- D372_Biop_Int2: low-quality cell filtering ---
# Drop cells with fewer than 500 detected genes (in place).
sc.pp.filter_cells(D372_Biop_Int2, min_genes=500)

# Keep cells below the total-count ceiling ...
below_count_cap = D372_Biop_Int2.obs['n_counts'] < 20000
D372_Biop_Int2 = D372_Biop_Int2[below_count_cap, :]

# ... and, among those, cells below the mitochondrial-fraction ceiling.
below_mito_cap = D372_Biop_Int2.obs['percent_mito'] < 0.2
D372_Biop_Int2 = D372_Biop_Int2[below_mito_cap, :]
filtered out 25 cells that have less than 500 genes expressed
# scrublet
# Runs on the filtered matrix; the normalization step comes later in the file.
scrublet_input = D372_Biop_Int2.X
scrub = scr.Scrublet(scrublet_input, expected_doublet_rate=0.031)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

# Store the per-cell score and the thresholded call in the cell metadata.
D372_Biop_Int2.obs['doublet_scores'] = doublet_scores
D372_Biop_Int2.obs['predicted_doublets'] = predicted_doublets

# Score histograms, used to eyeball the automatically chosen threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.25 Detected doublet rate = 1.3% Estimated detectable doublet fraction = 27.9% Overall doublet rate: Expected = 3.1% Estimated = 4.8% Elapsed time: 2.9 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecbb42898>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea1901a20>],
dtype=object))
# doubletDetection
# Second, independent doublet caller, run with its default settings on the
# same filtered matrix.
clf = doubletdetection.BoostClassifier()
fitted_clf = clf.fit(D372_Biop_Int2.X)
labels = fitted_clf.predict()

# Per-cell labels returned by the classifier, stored in the cell metadata.
D372_Biop_Int2.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8117721080780029 seconds Jaccard graph constructed in 1.0435481071472168 seconds Wrote graph to binary file in 0.08883523941040039 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.935617 After 11 runs, maximum modularity is Q = 0.93678 Louvain completed 31 runs in 2.9441475868225098 seconds PhenoGraph complete in 4.906459808349609 seconds Found communities [-1, ... 29], with sizes: [168, 849, 797, 383, 343, 269, 268, 243, 240, 215, 120, 113, 98, 91, 79, 78, 74, 62, 61, 55, 50, 49, 44, 36, 35, 34, 32, 17, 14, 13, 13] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8170309066772461 seconds Jaccard graph constructed in 0.7652695178985596 seconds Wrote graph to binary file in 0.3165557384490967 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.934101 Louvain completed 21 runs in 1.9748854637145996 seconds PhenoGraph complete in 3.891366481781006 seconds Found communities [-1, ... 28], with sizes: [174, 1031, 671, 398, 309, 265, 251, 209, 191, 157, 142, 137, 109, 91, 87, 83, 80, 79, 62, 60, 59, 59, 52, 42, 38, 36, 30, 17, 13, 11] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8148114681243896 seconds Jaccard graph constructed in 0.7761123180389404 seconds Wrote graph to binary file in 0.0947563648223877 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.934828 After 14 runs, maximum modularity is Q = 0.936188 Louvain completed 34 runs in 3.187971830368042 seconds PhenoGraph complete in 4.892239332199097 seconds Found communities [-1, ... 30], with sizes: [165, 913, 866, 449, 286, 231, 228, 225, 165, 134, 120, 120, 117, 89, 81, 74, 71, 65, 64, 60, 57, 55, 54, 47, 38, 34, 32, 30, 29, 18, 14, 12] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.2125005722045898 seconds Jaccard graph constructed in 0.7652847766876221 seconds Wrote graph to binary file in 0.0940396785736084 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.935108 Louvain completed 21 runs in 1.900214433670044 seconds PhenoGraph complete in 3.9911155700683594 seconds Found communities [-1, ... 27], with sizes: [161, 1628, 342, 311, 287, 259, 226, 211, 195, 193, 170, 116, 89, 85, 77, 72, 65, 60, 57, 57, 56, 50, 40, 32, 31, 31, 17, 14, 11] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.221062421798706 seconds Jaccard graph constructed in 1.015894889831543 seconds Wrote graph to binary file in 0.09170222282409668 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.934143 Louvain completed 21 runs in 1.893204689025879 seconds PhenoGraph complete in 4.241094350814819 seconds Found communities [-1, ... 28], with sizes: [143, 905, 868, 467, 337, 234, 230, 220, 165, 126, 116, 108, 99, 96, 89, 78, 75, 69, 65, 65, 61, 61, 50, 46, 45, 39, 31, 27, 17, 11] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.213078498840332 seconds Jaccard graph constructed in 0.7669408321380615 seconds Wrote graph to binary file in 0.09694147109985352 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.933835 Louvain completed 21 runs in 2.009591817855835 seconds PhenoGraph complete in 4.103895425796509 seconds Found communities [-1, ... 28], with sizes: [167, 914, 814, 468, 353, 263, 254, 198, 197, 167, 121, 114, 111, 87, 86, 86, 69, 63, 61, 52, 44, 43, 40, 32, 31, 29, 28, 25, 15, 11] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9172782897949219 seconds Jaccard graph constructed in 0.9897208213806152 seconds Wrote graph to binary file in 0.09256529808044434 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.93334 Louvain completed 21 runs in 1.8727903366088867 seconds PhenoGraph complete in 3.887312889099121 seconds Found communities [-1, ... 28], with sizes: [132, 1632, 325, 279, 259, 258, 255, 221, 217, 206, 120, 108, 89, 86, 78, 77, 67, 63, 61, 58, 51, 51, 43, 35, 33, 33, 31, 29, 29, 17] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8165619373321533 seconds Jaccard graph constructed in 0.7624201774597168 seconds Wrote graph to binary file in 0.30907201766967773 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.935205 After 10 runs, maximum modularity is Q = 0.936348 Louvain completed 30 runs in 2.844038724899292 seconds PhenoGraph complete in 4.750696659088135 seconds Found communities [-1, ... 28], with sizes: [145, 876, 780, 453, 365, 285, 249, 247, 198, 140, 122, 112, 108, 90, 88, 87, 79, 78, 63, 52, 52, 50, 46, 35, 35, 33, 31, 17, 16, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9124925136566162 seconds Jaccard graph constructed in 0.7728259563446045 seconds Wrote graph to binary file in 0.09240317344665527 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.933658 Louvain completed 21 runs in 1.9913523197174072 seconds PhenoGraph complete in 3.7952184677124023 seconds Found communities [-1, ... 26], with sizes: [158, 864, 857, 491, 461, 383, 215, 191, 133, 114, 114, 104, 88, 78, 77, 75, 69, 63, 60, 60, 54, 54, 46, 39, 36, 29, 19, 11] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.012800931930542 seconds Jaccard graph constructed in 0.7807364463806152 seconds Wrote graph to binary file in 0.35766029357910156 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.938331 Louvain completed 21 runs in 1.9387297630310059 seconds PhenoGraph complete in 4.108801603317261 seconds Found communities [-1, ... 28], with sizes: [199, 946, 825, 429, 320, 289, 248, 215, 163, 122, 109, 105, 101, 88, 81, 78, 75, 71, 61, 59, 56, 52, 50, 45, 42, 36, 33, 20, 13, 12] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9129884243011475 seconds Jaccard graph constructed in 0.7538411617279053 seconds Wrote graph to binary file in 0.09158706665039062 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.935929 Louvain completed 21 runs in 1.9764914512634277 seconds PhenoGraph complete in 3.7522823810577393 seconds Found communities [-1, ... 30], with sizes: [145, 886, 877, 342, 284, 252, 242, 221, 199, 195, 125, 115, 103, 99, 89, 77, 76, 74, 65, 61, 57, 56, 51, 50, 40, 37, 32, 31, 18, 17, 14, 13] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.821117639541626 seconds Jaccard graph constructed in 0.7486910820007324 seconds Wrote graph to binary file in 0.3156290054321289 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.935243 Louvain completed 21 runs in 1.900001049041748 seconds PhenoGraph complete in 3.801069974899292 seconds Found communities [-1, ... 28], with sizes: [140, 875, 864, 348, 278, 277, 249, 241, 235, 206, 136, 123, 121, 89, 77, 76, 69, 61, 59, 58, 57, 56, 50, 45, 37, 32, 30, 20, 18, 16] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.027388334274292 seconds Jaccard graph constructed in 0.7994790077209473 seconds Wrote graph to binary file in 0.10189247131347656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.933372 After 15 runs, maximum modularity is Q = 0.934527 Louvain completed 35 runs in 3.5184009075164795 seconds PhenoGraph complete in 5.469161033630371 seconds Found communities [-1, ... 28], with sizes: [175, 899, 864, 434, 327, 247, 240, 236, 196, 124, 124, 107, 95, 93, 84, 80, 75, 64, 61, 59, 59, 55, 52, 47, 34, 32, 32, 19, 17, 12] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0228221416473389 seconds Jaccard graph constructed in 0.7615830898284912 seconds Wrote graph to binary file in 0.09098577499389648 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.933012 After 2 runs, maximum modularity is Q = 0.934076 Louvain completed 22 runs in 2.399656057357788 seconds PhenoGraph complete in 4.292960166931152 seconds Found communities [-1, ... 27], with sizes: [149, 947, 790, 474, 448, 286, 261, 229, 125, 118, 118, 112, 96, 88, 78, 76, 67, 61, 60, 59, 57, 49, 40, 32, 31, 29, 27, 25, 11] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8194582462310791 seconds Jaccard graph constructed in 1.0443758964538574 seconds Wrote graph to binary file in 0.09239006042480469 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.93494 Louvain completed 21 runs in 1.9768857955932617 seconds PhenoGraph complete in 3.9492082595825195 seconds Found communities [-1, ... 28], with sizes: [178, 941, 776, 464, 442, 363, 240, 228, 116, 116, 99, 87, 84, 84, 83, 80, 78, 61, 53, 53, 52, 46, 38, 36, 35, 32, 30, 25, 12, 11] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0186831951141357 seconds Jaccard graph constructed in 0.7586953639984131 seconds Wrote graph to binary file in 0.0899960994720459 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.936074 After 7 runs, maximum modularity is Q = 0.937143 Louvain completed 27 runs in 2.5816171169281006 seconds PhenoGraph complete in 4.467833757400513 seconds Found communities [-1, ... 28], with sizes: [182, 935, 786, 422, 322, 266, 242, 235, 229, 128, 126, 115, 89, 85, 85, 83, 75, 72, 62, 61, 57, 56, 46, 38, 37, 34, 33, 17, 14, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9122560024261475 seconds Jaccard graph constructed in 1.0107040405273438 seconds Wrote graph to binary file in 0.0894930362701416 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.93546 Louvain completed 21 runs in 1.9582693576812744 seconds PhenoGraph complete in 3.987028121948242 seconds Found communities [-1, ... 31], with sizes: [182, 856, 850, 482, 374, 302, 299, 226, 115, 107, 106, 90, 87, 83, 82, 79, 77, 63, 59, 51, 45, 45, 41, 36, 34, 32, 28, 28, 24, 19, 16, 13, 12] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8124711513519287 seconds Jaccard graph constructed in 0.7607028484344482 seconds Wrote graph to binary file in 0.3139994144439697 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.933227 Louvain completed 21 runs in 1.9329650402069092 seconds PhenoGraph complete in 3.838634490966797 seconds Found communities [-1, ... 27], with sizes: [150, 932, 844, 357, 279, 266, 251, 232, 225, 202, 194, 121, 103, 91, 81, 78, 66, 61, 56, 53, 48, 42, 41, 36, 34, 33, 25, 22, 20] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0123755931854248 seconds Jaccard graph constructed in 0.7377088069915771 seconds Wrote graph to binary file in 0.09003496170043945 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.93549 Louvain completed 21 runs in 1.9168570041656494 seconds PhenoGraph complete in 3.7733287811279297 seconds Found communities [-1, ... 28], with sizes: [178, 878, 847, 433, 341, 296, 254, 239, 161, 136, 121, 119, 105, 90, 79, 76, 67, 65, 61, 56, 53, 52, 47, 46, 35, 32, 31, 17, 16, 12] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.81166672706604 seconds Jaccard graph constructed in 0.7489631175994873 seconds Wrote graph to binary file in 0.0907747745513916 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.934919 After 5 runs, maximum modularity is Q = 0.936508 Louvain completed 25 runs in 2.493182897567749 seconds PhenoGraph complete in 4.162370681762695 seconds Found communities [-1, ... 27], with sizes: [162, 933, 818, 431, 379, 329, 256, 243, 183, 129, 120, 102, 91, 90, 80, 78, 75, 61, 51, 43, 39, 39, 38, 35, 33, 31, 30, 25, 19] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9126756191253662 seconds Jaccard graph constructed in 0.9736266136169434 seconds Wrote graph to binary file in 0.09059786796569824 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.933951 After 3 runs, maximum modularity is Q = 0.935124 Louvain completed 23 runs in 2.36483097076416 seconds PhenoGraph complete in 4.35871696472168 seconds Found communities [-1, ... 27], with sizes: [169, 947, 786, 353, 288, 269, 262, 247, 225, 210, 122, 114, 104, 89, 88, 81, 80, 74, 60, 54, 54, 46, 46, 41, 37, 36, 33, 17, 11] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.011352777481079 seconds Jaccard graph constructed in 0.7464473247528076 seconds Wrote graph to binary file in 0.3128812313079834 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.936431 Louvain completed 21 runs in 1.9111721515655518 seconds PhenoGraph complete in 4.000088691711426 seconds Found communities [-1, ... 26], with sizes: [167, 913, 832, 460, 455, 312, 252, 237, 220, 113, 110, 97, 87, 77, 77, 74, 60, 55, 54, 49, 47, 37, 36, 33, 32, 27, 17, 13] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8121376037597656 seconds Jaccard graph constructed in 0.7482364177703857 seconds Wrote graph to binary file in 0.08965182304382324 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.934755 Louvain completed 21 runs in 1.8797099590301514 seconds PhenoGraph complete in 3.5465455055236816 seconds Found communities [-1, ... 29], with sizes: [173, 891, 853, 331, 279, 265, 264, 257, 237, 216, 138, 116, 109, 92, 79, 74, 71, 64, 61, 54, 49, 42, 34, 33, 31, 30, 30, 27, 19, 13, 11] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.912355899810791 seconds Jaccard graph constructed in 0.7591440677642822 seconds Wrote graph to binary file in 0.09034132957458496 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.935543 Louvain completed 21 runs in 1.9075205326080322 seconds PhenoGraph complete in 3.6860432624816895 seconds Found communities [-1, ... 28], with sizes: [125, 909, 860, 343, 296, 259, 242, 232, 230, 222, 125, 113, 111, 90, 85, 82, 78, 70, 61, 59, 57, 55, 44, 41, 41, 39, 31, 18, 14, 11] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0124380588531494 seconds Jaccard graph constructed in 0.7433838844299316 seconds Wrote graph to binary file in 0.3591461181640625 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.934186 Louvain completed 21 runs in 1.9338464736938477 seconds PhenoGraph complete in 4.067973852157593 seconds Found communities [-1, ... 27], with sizes: [146, 901, 852, 446, 302, 245, 244, 215, 191, 131, 128, 127, 112, 107, 93, 81, 80, 74, 64, 63, 62, 55, 48, 41, 38, 36, 31, 19, 11]
# --- D372_Biop_Int2: normalization and gene subsetting ---
# Scale every cell to 10,000 counts, then log-transform the matrix in place.
sc.pp.normalize_per_cell(D372_Biop_Int2, counts_per_cell_after=1e4)
sc.pp.log1p(D372_Biop_Int2)

# Keep the normalized, log-transformed object with all genes under .raw
# before any gene column is dropped below.
D372_Biop_Int2.raw = D372_Biop_Int2

# var['ribo_genes'] was stored inverted during QC (True = gene is NOT in the
# ribosomal list), so selecting on it removes the ribosomal genes.
non_ribo_mask = D372_Biop_Int2.var['ribo_genes']
D372_Biop_Int2 = D372_Biop_Int2[:, non_ribo_mask]
D372_Biop_Int2
View of AnnData object with n_obs × n_vars = 3955 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# --- D326_Brus_Dis1: load the 10x matrix and attach sample metadata ---
D326_Brus_Dis1 = sc.read_10x_mtx(
    './D326_Brus_Dis1/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
D326_Brus_Dis1.var_names_make_unique()

# Constant per-sample annotations, written in this order so the obs columns
# keep the same layout as the other datasets.
for obs_key, obs_value in (
    ('manip', 'D326_Brus_Dis1'),
    ('position', 'Distal'),
    ('method', 'Brushing'),
    ('donor', 'D326'),
):
    D326_Brus_Dis1.obs[obs_key] = obs_value

# Prefix each observation label with the sample name and use the result as
# the observation index.
D326_Brus_Dis1.obs['name'] = [
    'D326_Brus_Dis1_' + barcode for barcode in D326_Brus_Dis1.obs.index
]
D326_Brus_Dis1.obs_names = D326_Brus_Dis1.obs['name']
D326_Brus_Dis1
... reading from cache file ./cache/D326_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1250 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# --- D326_Brus_Dis1: per-cell QC metrics ---
sc.pl.highest_expr_genes(D326_Brus_Dis1, n_top=20)

# No cell can fail min_genes=0; the call is made so that obs['n_genes'] is
# available for the violin plot below.
sc.pp.filter_cells(D326_Brus_Dis1, min_genes=0)

# Total counts per cell, computed once and reused for both fractions and for
# obs['n_counts'].
total_counts = D326_Brus_Dis1.X.sum(axis=1).A1

# Fraction of each cell's counts that comes from 'MT-' prefixed genes.
mito_genes = D326_Brus_Dis1.var_names.str.startswith('MT-')
D326_Brus_Dis1.obs['percent_mito'] = (
    D326_Brus_Dis1[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D326_Brus_Dis1.obs['n_counts'] = total_counts

# remove_RB_genes needs a dense DataFrame; only the list of ribosomal genes
# found in the data (third return value) is used in this section.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D326_Brus_Dis1.to_df()
)

# The DataFrame columns are the var_names, so test membership on var_names
# directly instead of building a second dense DataFrame just for its labels.
ribo_genes = D326_Brus_Dis1.var_names.isin(RB_genes_in_df)
D326_Brus_Dis1.obs['percent_ribo'] = (
    D326_Brus_Dis1[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)

# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT
# ribosomal; it is used later as a keep-mask when subsetting genes.
D326_Brus_Dis1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D326_Brus_Dis1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# --- D326_Brus_Dis1: low-quality cell filtering ---
# Drop cells with fewer than 500 detected genes (in place).
sc.pp.filter_cells(D326_Brus_Dis1, min_genes=500)

# Keep cells below the total-count ceiling ...
below_count_cap = D326_Brus_Dis1.obs['n_counts'] < 15000
D326_Brus_Dis1 = D326_Brus_Dis1[below_count_cap, :]

# ... and, among those, cells below the mitochondrial-fraction ceiling.
below_mito_cap = D326_Brus_Dis1.obs['percent_mito'] < 0.25
D326_Brus_Dis1 = D326_Brus_Dis1[below_mito_cap, :]
filtered out 126 cells that have less than 500 genes expressed
# scrublet
# Runs on the filtered matrix; the normalization step comes later in the file.
scrublet_input = D326_Brus_Dis1.X
scrub = scr.Scrublet(scrublet_input, expected_doublet_rate=0.01)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

# Store the per-cell score and the thresholded call in the cell metadata.
D326_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D326_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets

# Score histograms, used to eyeball the automatically chosen threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.09 Detected doublet rate = 0.4% Estimated detectable doublet fraction = 32.9% Overall doublet rate: Expected = 1.0% Estimated = 1.4% Elapsed time: 0.6 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea7e52160>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea08d2d68>],
dtype=object))
# doubletDetection
# Second, independent doublet caller, run with its default settings on the
# same filtered matrix.
clf = doubletdetection.BoostClassifier()
fitted_clf = clf.fit(D326_Brus_Dis1.X)
labels = fitted_clf.predict()

# Per-cell labels returned by the classifier, stored in the cell metadata.
D326_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21236729621887207 seconds Jaccard graph constructed in 0.3849825859069824 seconds Wrote graph to binary file in 0.026634931564331055 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885995 After 2 runs, maximum modularity is Q = 0.887532 Louvain completed 22 runs in 1.5970864295959473 seconds PhenoGraph complete in 2.232036828994751 seconds Found communities [-1, ... 15], with sizes: [175, 276, 195, 102, 99, 82, 76, 63, 57, 49, 48, 40, 31, 30, 25, 22, 22] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21405839920043945 seconds Jaccard graph constructed in 0.45954298973083496 seconds Wrote graph to binary file in 0.02256298065185547 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882894 Louvain completed 21 runs in 1.3038721084594727 seconds PhenoGraph complete in 2.0111072063446045 seconds Found communities [-1, ... 17], with sizes: [160, 222, 196, 95, 82, 81, 74, 66, 61, 59, 56, 50, 35, 33, 31, 29, 23, 22, 17] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21121788024902344 seconds Jaccard graph constructed in 0.38576459884643555 seconds Wrote graph to binary file in 0.329129695892334 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.879739 Louvain completed 21 runs in 1.3448681831359863 seconds PhenoGraph complete in 2.2883236408233643 seconds Found communities [-1, ... 16], with sizes: [160, 283, 200, 118, 108, 100, 69, 61, 46, 41, 36, 31, 29, 25, 24, 23, 21, 17] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2129228115081787 seconds Jaccard graph constructed in 0.4608762264251709 seconds Wrote graph to binary file in 0.026796817779541016 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882111 After 2 runs, maximum modularity is Q = 0.883529 After 3 runs, maximum modularity is Q = 0.884916 Louvain completed 23 runs in 1.8575758934020996 seconds PhenoGraph complete in 2.567722797393799 seconds Found communities [-1, ... 17], with sizes: [142, 267, 165, 116, 92, 88, 75, 71, 63, 61, 60, 56, 26, 26, 26, 15, 15, 15, 13] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10753631591796875 seconds Jaccard graph constructed in 0.41381263732910156 seconds Wrote graph to binary file in 0.04300236701965332 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885981 After 2 runs, maximum modularity is Q = 0.887036 Louvain completed 22 runs in 1.5978055000305176 seconds PhenoGraph complete in 2.1889121532440186 seconds Found communities [-1, ... 17], with sizes: [147, 278, 164, 110, 88, 88, 69, 59, 59, 55, 46, 41, 38, 32, 32, 27, 25, 21, 13] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11262702941894531 seconds Jaccard graph constructed in 0.4353642463684082 seconds Wrote graph to binary file in 0.06667590141296387 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.883925 Louvain completed 21 runs in 1.3180835247039795 seconds PhenoGraph complete in 1.943319320678711 seconds Found communities [-1, ... 17], with sizes: [150, 259, 187, 121, 91, 87, 86, 62, 58, 46, 45, 35, 33, 27, 24, 23, 20, 19, 19] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11774826049804688 seconds Jaccard graph constructed in 0.4601624011993408 seconds Wrote graph to binary file in 0.029280662536621094 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89027 Louvain completed 21 runs in 1.2568957805633545 seconds PhenoGraph complete in 1.873772144317627 seconds Found communities [-1, ... 
19], with sizes: [164, 242, 191, 87, 83, 83, 71, 66, 61, 55, 52, 48, 36, 29, 24, 22, 20, 18, 15, 14, 11] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10627269744873047 seconds Jaccard graph constructed in 0.48020219802856445 seconds Wrote graph to binary file in 0.030440092086791992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890978 Louvain completed 21 runs in 1.3140740394592285 seconds PhenoGraph complete in 1.944899082183838 seconds Found communities [-1, ... 18], with sizes: [155, 256, 228, 90, 86, 78, 72, 67, 60, 52, 48, 39, 32, 29, 21, 20, 18, 16, 14, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21527576446533203 seconds Jaccard graph constructed in 0.40128302574157715 seconds Wrote graph to binary file in 0.04050946235656738 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885462 Louvain completed 21 runs in 1.3291218280792236 seconds PhenoGraph complete in 2.0129151344299316 seconds Found communities [-1, ... 19], with sizes: [150, 246, 200, 95, 82, 74, 70, 60, 58, 57, 50, 41, 35, 32, 30, 28, 21, 21, 15, 14, 13] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11005902290344238 seconds Jaccard graph constructed in 0.482438325881958 seconds Wrote graph to binary file in 0.029459238052368164 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885172 Louvain completed 21 runs in 1.3148517608642578 seconds PhenoGraph complete in 1.9467573165893555 seconds Found communities [-1, ... 17], with sizes: [174, 274, 194, 92, 76, 75, 72, 66, 64, 59, 57, 41, 31, 27, 25, 21, 15, 15, 14] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2090897560119629 seconds Jaccard graph constructed in 0.4054684638977051 seconds Wrote graph to binary file in 0.05972576141357422 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.884556 Louvain completed 21 runs in 1.3259060382843018 seconds PhenoGraph complete in 2.0101284980773926 seconds Found communities [-1, ... 19], with sizes: [157, 268, 196, 98, 76, 65, 65, 56, 56, 48, 43, 35, 34, 33, 31, 30, 29, 21, 20, 16, 15] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11155295372009277 seconds Jaccard graph constructed in 0.39594268798828125 seconds Wrote graph to binary file in 0.28209710121154785 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886023 Louvain completed 21 runs in 1.323824167251587 seconds PhenoGraph complete in 2.1238250732421875 seconds Found communities [-1, ... 
17], with sizes: [169, 251, 155, 119, 90, 89, 85, 84, 58, 57, 49, 33, 30, 28, 23, 22, 21, 16, 13] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11206221580505371 seconds Jaccard graph constructed in 0.45722389221191406 seconds Wrote graph to binary file in 0.028032779693603516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.881474 After 5 runs, maximum modularity is Q = 0.882949 Louvain completed 25 runs in 1.755934715270996 seconds PhenoGraph complete in 2.3644959926605225 seconds Found communities [-1, ... 16], with sizes: [145, 285, 189, 111, 93, 89, 73, 63, 60, 56, 47, 41, 31, 29, 25, 21, 18, 16] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11324906349182129 seconds Jaccard graph constructed in 0.40363240242004395 seconds Wrote graph to binary file in 0.050939321517944336 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886927 Louvain completed 21 runs in 1.3983840942382812 seconds PhenoGraph complete in 1.9748764038085938 seconds Found communities [-1, ... 16], with sizes: [183, 275, 208, 122, 62, 62, 57, 56, 54, 52, 47, 42, 34, 31, 29, 29, 27, 22] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11382365226745605 seconds Jaccard graph constructed in 0.46040773391723633 seconds Wrote graph to binary file in 0.027059555053710938 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882505 After 2 runs, maximum modularity is Q = 0.885266 Louvain completed 22 runs in 1.598015546798706 seconds PhenoGraph complete in 2.2085375785827637 seconds Found communities [-1, ... 17], with sizes: [166, 271, 208, 112, 86, 84, 65, 59, 58, 42, 40, 37, 29, 29, 29, 24, 24, 15, 14] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11151361465454102 seconds Jaccard graph constructed in 0.5021193027496338 seconds Wrote graph to binary file in 0.031245946884155273 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891541 Louvain completed 21 runs in 1.3343415260314941 seconds PhenoGraph complete in 1.991173267364502 seconds Found communities [-1, ... 19], with sizes: [140, 249, 172, 103, 86, 80, 59, 58, 54, 52, 51, 44, 43, 33, 31, 31, 30, 24, 21, 20, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.113037109375 seconds Jaccard graph constructed in 0.45372676849365234 seconds Wrote graph to binary file in 0.026300907135009766 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.883527 Louvain completed 21 runs in 1.3329782485961914 seconds PhenoGraph complete in 1.9335401058197021 seconds Found communities [-1, ... 18], with sizes: [178, 198, 186, 93, 89, 82, 75, 66, 64, 61, 57, 55, 39, 29, 27, 24, 23, 17, 16, 13] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1065073013305664 seconds Jaccard graph constructed in 0.4500918388366699 seconds Wrote graph to binary file in 0.04378342628479004 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885788 Louvain completed 21 runs in 1.5246539115905762 seconds PhenoGraph complete in 2.1452653408050537 seconds Found communities [-1, ... 20], with sizes: [168, 240, 168, 104, 101, 84, 81, 58, 53, 53, 47, 33, 30, 30, 29, 23, 19, 17, 17, 13, 13, 11] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10833430290222168 seconds Jaccard graph constructed in 0.40520429611206055 seconds Wrote graph to binary file in 0.040392160415649414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886998 Louvain completed 21 runs in 1.3182458877563477 seconds PhenoGraph complete in 1.8945374488830566 seconds Found communities [-1, ... 
17], with sizes: [145, 265, 132, 92, 85, 82, 79, 70, 61, 59, 52, 46, 44, 38, 33, 32, 32, 23, 22] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1098332405090332 seconds Jaccard graph constructed in 0.41411828994750977 seconds Wrote graph to binary file in 0.03911447525024414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888591 Louvain completed 21 runs in 1.5329885482788086 seconds PhenoGraph complete in 2.1093666553497314 seconds Found communities [-1, ... 18], with sizes: [166, 233, 175, 85, 83, 75, 75, 73, 60, 54, 48, 41, 40, 38, 31, 30, 28, 22, 22, 13] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10961723327636719 seconds Jaccard graph constructed in 0.5002346038818359 seconds Wrote graph to binary file in 0.033417463302612305 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886099 Louvain completed 21 runs in 1.5221836566925049 seconds PhenoGraph complete in 2.1778619289398193 seconds Found communities [-1, ... 17], with sizes: [160, 247, 189, 98, 91, 85, 75, 60, 55, 50, 49, 41, 37, 35, 32, 30, 24, 20, 14] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10643386840820312 seconds Jaccard graph constructed in 0.40942883491516113 seconds Wrote graph to binary file in 0.2884066104888916 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.884044 After 5 runs, maximum modularity is Q = 0.885202 Louvain completed 25 runs in 1.7502760887145996 seconds PhenoGraph complete in 2.568711519241333 seconds Found communities [-1, ... 16], with sizes: [161, 255, 229, 114, 81, 70, 65, 61, 57, 53, 44, 40, 35, 33, 27, 24, 24, 19] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1112668514251709 seconds Jaccard graph constructed in 0.45477819442749023 seconds Wrote graph to binary file in 0.028890132904052734 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886769 After 9 runs, maximum modularity is Q = 0.887841 Louvain completed 29 runs in 1.9556076526641846 seconds PhenoGraph complete in 2.5612595081329346 seconds Found communities [-1, ... 19], with sizes: [147, 276, 154, 86, 82, 71, 59, 58, 55, 53, 50, 44, 43, 38, 36, 31, 28, 27, 25, 15, 14] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11242341995239258 seconds Jaccard graph constructed in 0.3963158130645752 seconds Wrote graph to binary file in 0.06721639633178711 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885387 Louvain completed 21 runs in 1.337975025177002 seconds PhenoGraph complete in 1.9228894710540771 seconds Found communities [-1, ... 20], with sizes: [162, 251, 196, 84, 82, 62, 57, 55, 54, 50, 48, 48, 46, 34, 32, 31, 22, 20, 20, 16, 11, 11] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11513066291809082 seconds Jaccard graph constructed in 0.48522496223449707 seconds Wrote graph to binary file in 0.028902292251586914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890073 Louvain completed 21 runs in 1.5114171504974365 seconds PhenoGraph complete in 2.151106119155884 seconds Found communities [-1, ... 15], with sizes: [150, 289, 197, 98, 94, 91, 89, 59, 55, 55, 45, 39, 30, 28, 27, 24, 22]
# Post-doublet-inference transformation of sample D326_Brus_Dis1:
# scale every cell to 10,000 total counts, then log1p-transform.
sc.pp.normalize_per_cell(D326_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D326_Brus_Dis1) # log transform the data
# .raw is assigned BEFORE the gene subsetting below, so it still holds every gene.
D326_Brus_Dis1.raw = D326_Brus_Dis1 # freeze the object (for later use of the raw state of it)
# NOTE(review): despite its name, var['ribo_genes'] looks like a keep-mask that is
# True for NON-ribosomal genes (the D337_Brus_Dis1 sample builds it as the
# negation of the ribosomal mask) -- confirm it is built the same way here.
# Boolean column indexing yields a view of the AnnData object, not a copy.
D326_Brus_Dis1 = D326_Brus_Dis1[:, D326_Brus_Dis1.var['ribo_genes']]
# Bare expression: the notebook displays the summary of the resulting object.
D326_Brus_Dis1
View of AnnData object with n_obs × n_vars = 1114 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x count matrix of sample D337_Brus_Dis1, using gene symbols as
# variable names, and make duplicated symbols unique.
D337_Brus_Dis1 = sc.read_10x_mtx(
    './D337_Brus_Dis1/' + outsPath, var_names='gene_symbols', cache=True
)
D337_Brus_Dis1.var_names_make_unique()

# Sample-level annotations: the same value is broadcast to every cell.
for obs_key, obs_value in [
    ('manip', 'D337_Brus_Dis1'),
    ('position', 'Distal'),
    ('method', 'Brushing'),
    ('donor', 'D337'),
]:
    D337_Brus_Dis1.obs[obs_key] = obs_value

# Prefix every barcode with the sample name and use the result as cell names
# (presumably to keep cell names unique once the datasets are aggregated).
D337_Brus_Dis1.obs['name'] = [
    'D337_Brus_Dis1' + barcode for barcode in D337_Brus_Dis1.obs.index
]
D337_Brus_Dis1.obs_names = D337_Brus_Dis1.obs['name']

# Bare expression: the notebook displays the summary of the loaded object.
D337_Brus_Dis1
... reading from cache file ./cache/D337_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1428 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Plot the 20 top-expressed genes (n_top=20) of the sample before any cell filtering.
sc.pl.highest_expr_genes(D337_Brus_Dis1, n_top=20)
# Per-cell QC metrics for D337_Brus_Dis1, computed on the unfiltered counts.
# min_genes=0 is not meant to remove cells: the call is used for the 'n_genes'
# column it adds to .obs (plotted below).
sc.pp.filter_cells(D337_Brus_Dis1, min_genes=0)

# Total counts per cell, computed once and reused as denominator for both fractions.
total_counts = D337_Brus_Dis1.X.sum(axis=1).A1

# Mitochondrial genes are identified by their 'MT-' symbol prefix.
# NOTE: 'percent_mito' / 'percent_ribo' are fractions in [0, 1], not percentages.
mito_genes = D337_Brus_Dis1.var_names.str.startswith('MT-')
D337_Brus_Dis1.obs['percent_mito'] = (
    D337_Brus_Dis1[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D337_Brus_Dis1.obs['n_counts'] = total_counts

# remove_RB_genes needs a dense cells x genes DataFrame; only the list of
# ribosomal genes present in this dataset is used afterwards.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D337_Brus_Dis1.to_df())
# The columns of to_df() are var_names, so build the mask from var_names directly
# instead of densifying the whole matrix a second time.
ribo_genes = D337_Brus_Dis1.var_names.isin(RB_genes_in_df)
D337_Brus_Dis1.obs['percent_ribo'] = (
    D337_Brus_Dis1[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)
# CAUTION: despite its name, var['ribo_genes'] is the NEGATED mask, i.e. True for
# genes that are NOT ribosomal. The key is kept unchanged because other cells of
# the notebook index genes with it.
D337_Brus_Dis1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D337_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering: keep cells with at least 500 detected genes,
# fewer than 30000 counts and a mitochondrial fraction below 0.5.
sc.pp.filter_cells(D337_Brus_Dis1, min_genes=500)
keep_cells = (
    (D337_Brus_Dis1.obs['n_counts'] < 30000)
    & (D337_Brus_Dis1.obs['percent_mito'] < 0.5)
)
# Slice once with the combined mask and take an explicit copy: boolean indexing
# returns a view, and later assignments into .obs of a view would otherwise
# rely on anndata implicitly converting the view into a real object.
D337_Brus_Dis1 = D337_Brus_Dis1[keep_cells, :].copy()
filtered out 17 cells that have less than 500 genes expressed
# scrublet
# Runs on the cell-filtered count matrix; as far as this section shows, .X has
# not been normalized yet at this point -- TODO confirm.
# expected_doublet_rate=0.012 is the prior doublet rate (1.2%) given to Scrublet.
scrub = scr.Scrublet(D337_Brus_Dis1.X, expected_doublet_rate=0.012)
# Returns one doublet score and one doublet call per cell; the log output shows
# that the score threshold is set automatically.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata of the dataset.
D337_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D337_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
# Last expression of the cell: besides drawing the histogram, the returned
# (figure, axes) tuple is echoed in the notebook output.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.12 Detected doublet rate = 0.4% Estimated detectable doublet fraction = 40.2% Overall doublet rate: Expected = 1.2% Estimated = 1.1% Elapsed time: 1.0 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea294a2b0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea13a24a8>],
dtype=object))
# doubletDetection
# Classifier with default settings; the log output shows 25 iterations, each
# creating synthetic doublets, normalizing, running PCA and clustering with Phenograph.
clf = doubletdetection.BoostClassifier()
# The sparse matrix is passed as is; doubletdetection densifies it itself
# (see the UserWarning in the output).
labels = clf.fit(D337_Brus_Dis1.X).predict()
# Store the labels in the per-cell metadata, next to the scrublet results.
# NOTE(review): presumably 1 = doublet, 0 = singlet -- confirm against the doubletdetection docs.
D337_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21335673332214355 seconds Jaccard graph constructed in 0.4331936836242676 seconds Wrote graph to binary file in 0.04625391960144043 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.889309 Louvain completed 21 runs in 1.4623045921325684 seconds PhenoGraph complete in 2.1702306270599365 seconds Found communities [-1, ... 17], with sizes: [280, 286, 234, 125, 122, 108, 95, 82, 73, 67, 53, 41, 33, 33, 31, 25, 24, 20, 15] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21367979049682617 seconds Jaccard graph constructed in 0.46930909156799316 seconds Wrote graph to binary file in 0.3069896697998047 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890979 After 8 runs, maximum modularity is Q = 0.892065 Louvain completed 28 runs in 2.011324167251587 seconds PhenoGraph complete in 3.012248992919922 seconds Found communities [-1, ... 16], with sizes: [272, 315, 293, 163, 132, 85, 83, 69, 65, 53, 49, 31, 30, 30, 24, 23, 19, 11] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21150565147399902 seconds Jaccard graph constructed in 0.5038797855377197 seconds Wrote graph to binary file in 0.040128469467163086 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891754 Louvain completed 21 runs in 1.4095039367675781 seconds PhenoGraph complete in 2.1780807971954346 seconds Found communities [-1, ... 17], with sizes: [240, 351, 215, 128, 119, 100, 92, 72, 71, 67, 54, 53, 34, 32, 31, 26, 24, 22, 16] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21395373344421387 seconds Jaccard graph constructed in 0.46888303756713867 seconds Wrote graph to binary file in 0.03740644454956055 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886129 After 15 runs, maximum modularity is Q = 0.887213 Louvain completed 35 runs in 2.362523078918457 seconds PhenoGraph complete in 3.094099283218384 seconds Found communities [-1, ... 14], with sizes: [327, 292, 274, 160, 136, 99, 85, 70, 67, 54, 48, 38, 29, 25, 24, 19] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21154189109802246 seconds Jaccard graph constructed in 0.4939708709716797 seconds Wrote graph to binary file in 0.03615260124206543 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891106 After 2 runs, maximum modularity is Q = 0.89315 Louvain completed 22 runs in 1.6358544826507568 seconds PhenoGraph complete in 2.3885648250579834 seconds Found communities [-1, ... 17], with sizes: [301, 306, 264, 126, 98, 87, 78, 72, 67, 63, 61, 50, 47, 35, 29, 25, 16, 11, 11] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2123885154724121 seconds Jaccard graph constructed in 0.5010015964508057 seconds Wrote graph to binary file in 0.0345149040222168 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.88907 Louvain completed 21 runs in 1.3986222743988037 seconds PhenoGraph complete in 2.1557860374450684 seconds Found communities [-1, ... 16], with sizes: [303, 328, 264, 157, 139, 76, 70, 60, 55, 52, 50, 47, 30, 30, 28, 21, 19, 18] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21259760856628418 seconds Jaccard graph constructed in 0.48282337188720703 seconds Wrote graph to binary file in 0.036559104919433594 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888333 After 3 runs, maximum modularity is Q = 0.890067 Louvain completed 23 runs in 1.7427825927734375 seconds PhenoGraph complete in 2.4860239028930664 seconds Found communities [-1, ... 16], with sizes: [291, 341, 270, 150, 132, 86, 79, 72, 68, 59, 54, 28, 25, 23, 23, 18, 16, 12] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21264147758483887 seconds Jaccard graph constructed in 0.48983144760131836 seconds Wrote graph to binary file in 0.03361320495605469 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.889108 After 3 runs, maximum modularity is Q = 0.890511 Louvain completed 23 runs in 1.7381327152252197 seconds PhenoGraph complete in 2.4849116802215576 seconds Found communities [-1, ... 14], with sizes: [347, 313, 280, 136, 123, 104, 86, 65, 57, 53, 48, 33, 30, 29, 28, 15] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21393513679504395 seconds Jaccard graph constructed in 0.471268892288208 seconds Wrote graph to binary file in 0.03370261192321777 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.88863 Louvain completed 21 runs in 1.3928608894348145 seconds PhenoGraph complete in 2.1209239959716797 seconds Found communities [-1, ... 16], with sizes: [293, 425, 192, 165, 123, 80, 67, 66, 59, 50, 43, 41, 32, 30, 30, 25, 14, 12] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2122182846069336 seconds Jaccard graph constructed in 0.48700690269470215 seconds Wrote graph to binary file in 0.2865562438964844 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891116 Louvain completed 21 runs in 1.3843319416046143 seconds PhenoGraph complete in 2.383152484893799 seconds Found communities [-1, ... 17], with sizes: [273, 318, 285, 131, 105, 95, 71, 66, 66, 56, 50, 47, 32, 32, 31, 28, 24, 22, 15] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21279048919677734 seconds Jaccard graph constructed in 0.47730088233947754 seconds Wrote graph to binary file in 0.033003807067871094 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888836 After 4 runs, maximum modularity is Q = 0.89014 Louvain completed 24 runs in 1.8066763877868652 seconds PhenoGraph complete in 2.5400307178497314 seconds Found communities [-1, ... 
14], with sizes: [311, 365, 263, 144, 134, 84, 67, 61, 60, 58, 49, 35, 35, 32, 28, 21] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21146702766418457 seconds Jaccard graph constructed in 0.48106908798217773 seconds Wrote graph to binary file in 0.034003257751464844 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.887739 After 2 runs, maximum modularity is Q = 0.889539 Louvain completed 22 runs in 1.6745891571044922 seconds PhenoGraph complete in 2.4130375385284424 seconds Found communities [-1, ... 14], with sizes: [284, 320, 301, 167, 143, 90, 87, 68, 61, 51, 45, 36, 29, 27, 24, 14] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2134096622467041 seconds Jaccard graph constructed in 0.47774243354797363 seconds Wrote graph to binary file in 0.036612749099731445 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891666 Louvain completed 21 runs in 1.3963208198547363 seconds PhenoGraph complete in 2.136085271835327 seconds Found communities [-1, ... 18], with sizes: [269, 349, 248, 150, 129, 89, 76, 70, 63, 61, 55, 36, 29, 25, 24, 19, 19, 13, 12, 11] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20694851875305176 seconds Jaccard graph constructed in 0.4779665470123291 seconds Wrote graph to binary file in 0.033194541931152344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.89278 Louvain completed 21 runs in 1.380201816558838 seconds PhenoGraph complete in 2.10831618309021 seconds Found communities [-1, ... 14], with sizes: [304, 320, 281, 139, 134, 91, 79, 75, 62, 55, 49, 35, 32, 32, 31, 28] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21299409866333008 seconds Jaccard graph constructed in 0.4699113368988037 seconds Wrote graph to binary file in 0.036470890045166016 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890938 Louvain completed 21 runs in 1.3843612670898438 seconds PhenoGraph complete in 2.114614963531494 seconds Found communities [-1, ... 15], with sizes: [325, 353, 235, 160, 134, 92, 68, 68, 65, 54, 48, 34, 29, 25, 21, 20, 16] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21335363388061523 seconds Jaccard graph constructed in 0.4796717166900635 seconds Wrote graph to binary file in 0.033203840255737305 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890016 After 3 runs, maximum modularity is Q = 0.891483 Louvain completed 23 runs in 1.7361114025115967 seconds PhenoGraph complete in 2.475517511367798 seconds Found communities [-1, ... 
17], with sizes: [293, 330, 181, 162, 129, 98, 92, 87, 67, 55, 54, 51, 33, 28, 23, 20, 19, 13, 12] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2094576358795166 seconds Jaccard graph constructed in 0.4783966541290283 seconds Wrote graph to binary file in 0.034241676330566406 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891641 Louvain completed 21 runs in 1.3947453498840332 seconds PhenoGraph complete in 2.1288998126983643 seconds Found communities [-1, ... 15], with sizes: [345, 325, 259, 140, 128, 85, 70, 68, 67, 63, 48, 30, 29, 28, 24, 23, 15] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21103763580322266 seconds Jaccard graph constructed in 0.4701263904571533 seconds Wrote graph to binary file in 0.2873952388763428 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.889394 After 8 runs, maximum modularity is Q = 0.890548 Louvain completed 28 runs in 2.0043752193450928 seconds PhenoGraph complete in 2.982663631439209 seconds Found communities [-1, ... 17], with sizes: [312, 303, 265, 157, 132, 97, 69, 62, 62, 54, 50, 32, 31, 24, 24, 24, 20, 17, 12] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21220898628234863 seconds Jaccard graph constructed in 0.4945716857910156 seconds Wrote graph to binary file in 0.033548831939697266 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886742 After 5 runs, maximum modularity is Q = 0.887786 Louvain completed 25 runs in 1.8308155536651611 seconds PhenoGraph complete in 2.5825717449188232 seconds Found communities [-1, ... 15], with sizes: [296, 406, 224, 145, 131, 82, 77, 75, 69, 65, 54, 29, 26, 23, 19, 14, 12] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21071267127990723 seconds Jaccard graph constructed in 0.49286913871765137 seconds Wrote graph to binary file in 0.033945322036743164 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890355 Louvain completed 21 runs in 1.3779034614562988 seconds PhenoGraph complete in 2.127120018005371 seconds Found communities [-1, ... 14], with sizes: [304, 345, 274, 157, 135, 80, 79, 70, 64, 50, 49, 30, 29, 28, 28, 25] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2136213779449463 seconds Jaccard graph constructed in 0.4851834774017334 seconds Wrote graph to binary file in 0.03219246864318848 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888702 Louvain completed 21 runs in 1.4015047550201416 seconds PhenoGraph complete in 2.143026828765869 seconds Found communities [-1, ... 
14], with sizes: [321, 381, 225, 154, 137, 74, 72, 65, 59, 55, 53, 33, 32, 32, 31, 23] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21358609199523926 seconds Jaccard graph constructed in 0.47631025314331055 seconds Wrote graph to binary file in 0.03299760818481445 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893192 Louvain completed 21 runs in 1.3891570568084717 seconds PhenoGraph complete in 2.123581886291504 seconds Found communities [-1, ... 15], with sizes: [265, 341, 308, 165, 128, 97, 74, 71, 64, 49, 45, 30, 29, 26, 22, 21, 12] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2122657299041748 seconds Jaccard graph constructed in 0.4299893379211426 seconds Wrote graph to binary file in 0.04511857032775879 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.88642 Louvain completed 21 runs in 1.3999876976013184 seconds PhenoGraph complete in 2.127727746963501 seconds Found communities [-1, ... 17], with sizes: [282, 355, 248, 129, 128, 100, 89, 80, 64, 54, 49, 30, 30, 28, 24, 18, 16, 12, 11] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21165895462036133 seconds Jaccard graph constructed in 0.4974677562713623 seconds Wrote graph to binary file in 0.0336461067199707 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891933 Louvain completed 21 runs in 1.4018588066101074 seconds PhenoGraph complete in 2.156179904937744 seconds Found communities [-1, ... 17], with sizes: [287, 309, 291, 160, 128, 89, 77, 72, 63, 58, 49, 32, 28, 24, 20, 18, 15, 14, 13] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21214962005615234 seconds Jaccard graph constructed in 0.47988247871398926 seconds Wrote graph to binary file in 0.2532627582550049 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888439 Louvain completed 21 runs in 1.398589849472046 seconds PhenoGraph complete in 2.357525587081909 seconds Found communities [-1, ... 17], with sizes: [289, 371, 247, 132, 120, 101, 92, 87, 59, 48, 35, 32, 29, 25, 24, 18, 13, 13, 12]
# Scale every cell to 10,000 total counts, then log1p-transform in place.
sc.pp.normalize_per_cell(D337_Brus_Dis1, counts_per_cell_after=1e4)
sc.pp.log1p(D337_Brus_Dis1)

# Keep the full gene set (already normalized and log-transformed at this
# point) in .raw before the gene subset below.
D337_Brus_Dis1.raw = D337_Brus_Dis1

# Subset the genes with the boolean var['ribo_genes'] column.
# NOTE(review): presumably True marks NON-ribosomal genes, as in the QC cells
# of the other samples, so this drops the ribosomal genes -- confirm.
genes_to_keep = D337_Brus_Dis1.var['ribo_genes']
D337_Brus_Dis1 = D337_Brus_Dis1[:, genes_to_keep]
D337_Brus_Dis1
View of AnnData object with n_obs × n_vars = 1398 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x matrix of sample D339_Brus_Dis1, indexing genes by symbol.
D339_Brus_Dis1 = sc.read_10x_mtx(
    './D339_Brus_Dis1/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
D339_Brus_Dis1.var_names_make_unique()

# Sample-level annotations; each value is the same for every cell of the run.
D339_Brus_Dis1.obs['manip'] = 'D339_Brus_Dis1'
D339_Brus_Dis1.obs['position'] = 'Distal'
D339_Brus_Dis1.obs['method'] = 'Brushing'
D339_Brus_Dis1.obs['donor'] = 'D339'

# Prefix each cell name with the sample name and use the result as the
# observation names.
D339_Brus_Dis1.obs['name'] = [
    'D339_Brus_Dis1_' + barcode for barcode in D339_Brus_Dis1.obs.index
]
D339_Brus_Dis1.obs_names = D339_Brus_Dis1.obs['name']
D339_Brus_Dis1
... reading from cache file ./cache/D339_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1382 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Quick look at the 20 most highly expressed genes of the sample.
sc.pl.highest_expr_genes(D339_Brus_Dis1, n_top=20)

# min_genes=0 is not meant to remove cells here; the call is used because it
# adds the per-cell 'n_genes' column to .obs.
sc.pp.filter_cells(D339_Brus_Dis1, min_genes=0)

# Per-cell total counts: computed once and reused as the denominator of every
# fraction below (the matrix is not modified in between).
total_counts = D339_Brus_Dis1.X.sum(axis=1).A1

# Fraction of counts coming from mitochondrial genes ('MT-' prefix).
mito_genes = D339_Brus_Dis1.var_names.str.startswith('MT-')
D339_Brus_Dis1.obs['percent_mito'] = (
    D339_Brus_Dis1[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D339_Brus_Dis1.obs['n_counts'] = total_counts

# remove_RB_genes supplies the list of ribosomal genes present in this
# dataset. The gene mask is then built from var_names directly, which avoids
# converting the whole matrix to a dense DataFrame a second time.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D339_Brus_Dis1.to_df()
)
ribo_genes = D339_Brus_Dis1.var_names.isin(RB_genes_in_df)
D339_Brus_Dis1.obs['percent_ribo'] = (
    D339_Brus_Dis1[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)

# NOTE: despite its name, var['ribo_genes'] is True for NON-ribosomal genes.
# It is the keep-mask used after normalization to drop the ribosomal genes.
D339_Brus_Dis1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D339_Brus_Dis1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for this sample, in three successive steps:
# at least 500 detected genes, fewer than 15000 total counts, and a
# mitochondrial fraction below 0.5.
sc.pp.filter_cells(D339_Brus_Dis1, min_genes=500)

below_count_cap = D339_Brus_Dis1.obs['n_counts'] < 15000
D339_Brus_Dis1 = D339_Brus_Dis1[below_count_cap, :]

# Evaluated on the already count-filtered object, as the steps are sequential.
below_mito_cap = D339_Brus_Dis1.obs['percent_mito'] < 0.5
D339_Brus_Dis1 = D339_Brus_Dis1[below_mito_cap, :]
filtered out 35 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the QC-filtered matrix; this runs before
# the normalization / log-transform step applied to this sample further down.
# NOTE(review): expected_doublet_rate differs between samples; presumably it
# was chosen from the number of cells in each run -- confirm.
scrub = scr.Scrublet(D339_Brus_Dis1.X, expected_doublet_rate=0.012)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store the per-cell score and the doublet call in the cell metadata.
D339_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D339_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
# Histogram of doublet scores, to check the automatically chosen threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.12 Detected doublet rate = 0.4% Estimated detectable doublet fraction = 31.3% Overall doublet rate: Expected = 1.2% Estimated = 1.4% Elapsed time: 0.7 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea0b3f7b8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea0b73ba8>],
dtype=object))
# doubletDetection
# Second doublet inference, with DoubletDetection's BoostClassifier (default
# parameters), run on the same QC-filtered matrix as Scrublet above.
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D339_Brus_Dis1.X).predict()
# Store the per-cell call in the cell metadata.
# NOTE(review): presumably 1 = doublet, 0 = singlet -- confirm against the
# doubletdetection documentation.
D339_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11077642440795898 seconds Jaccard graph constructed in 0.46425771713256836 seconds Wrote graph to binary file in 0.029248476028442383 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885009 Louvain completed 21 runs in 1.3841643333435059 seconds PhenoGraph complete in 2.0007967948913574 seconds Found communities [-1, ... 21], with sizes: [160, 162, 153, 140, 111, 95, 91, 81, 77, 76, 76, 69, 69, 65, 53, 33, 32, 25, 23, 22, 21, 18, 16] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11488533020019531 seconds Jaccard graph constructed in 0.41995882987976074 seconds Wrote graph to binary file in 0.034644365310668945 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888411 Louvain completed 21 runs in 1.3784821033477783 seconds PhenoGraph complete in 1.9651494026184082 seconds Found communities [-1, ... 20], with sizes: [160, 173, 149, 143, 98, 94, 87, 80, 78, 77, 77, 76, 71, 71, 47, 41, 36, 27, 25, 21, 20, 17] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.210890531539917 seconds Jaccard graph constructed in 0.4322068691253662 seconds Wrote graph to binary file in 0.042722463607788086 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.883775 Louvain completed 21 runs in 1.4371147155761719 seconds PhenoGraph complete in 2.1498568058013916 seconds Found communities [-1, ... 21], with sizes: [117, 259, 167, 152, 84, 80, 79, 77, 77, 76, 75, 70, 63, 42, 42, 34, 33, 33, 24, 24, 24, 20, 16] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10892605781555176 seconds Jaccard graph constructed in 0.40697622299194336 seconds Wrote graph to binary file in 0.03625774383544922 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885898 After 9 runs, maximum modularity is Q = 0.887208 Louvain completed 29 runs in 2.031949520111084 seconds PhenoGraph complete in 2.606541156768799 seconds Found communities [-1, ... 21], with sizes: [175, 204, 167, 157, 107, 89, 85, 84, 70, 68, 65, 62, 47, 43, 42, 37, 33, 32, 27, 26, 20, 15, 13] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11081123352050781 seconds Jaccard graph constructed in 0.4158029556274414 seconds Wrote graph to binary file in 0.3278360366821289 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.889271 Louvain completed 21 runs in 1.393808126449585 seconds PhenoGraph complete in 2.260817050933838 seconds Found communities [-1, ... 21], with sizes: [152, 259, 181, 146, 99, 91, 79, 76, 71, 67, 63, 49, 47, 47, 46, 46, 37, 23, 22, 22, 16, 16, 13] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11050081253051758 seconds Jaccard graph constructed in 0.5362215042114258 seconds Wrote graph to binary file in 0.044724464416503906 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.883253 Louvain completed 21 runs in 1.5084295272827148 seconds PhenoGraph complete in 2.21579909324646 seconds Found communities [-1, ... 21], with sizes: [150, 250, 162, 124, 120, 105, 94, 84, 76, 75, 64, 62, 57, 51, 37, 34, 27, 21, 17, 17, 14, 14, 13] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10633039474487305 seconds Jaccard graph constructed in 0.43742847442626953 seconds Wrote graph to binary file in 0.05077648162841797 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.883893 After 7 runs, maximum modularity is Q = 0.885032 Louvain completed 27 runs in 1.9285502433776855 seconds PhenoGraph complete in 2.541843891143799 seconds Found communities [-1, ... 20], with sizes: [130, 250, 162, 115, 113, 105, 90, 88, 86, 86, 78, 68, 66, 53, 45, 30, 21, 20, 18, 16, 15, 13] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10640215873718262 seconds Jaccard graph constructed in 0.4886972904205322 seconds Wrote graph to binary file in 0.04035234451293945 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885462 Louvain completed 21 runs in 1.389136552810669 seconds PhenoGraph complete in 2.035393714904785 seconds Found communities [-1, ... 23], with sizes: [140, 176, 152, 138, 115, 97, 94, 91, 77, 77, 76, 76, 72, 37, 37, 37, 30, 25, 22, 20, 17, 17, 16, 16, 13] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11144781112670898 seconds Jaccard graph constructed in 0.47895145416259766 seconds Wrote graph to binary file in 0.03691458702087402 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.887619 After 3 runs, maximum modularity is Q = 0.888856 Louvain completed 23 runs in 1.7179548740386963 seconds PhenoGraph complete in 2.3571958541870117 seconds Found communities [-1, ... 21], with sizes: [133, 157, 134, 118, 104, 101, 94, 90, 89, 88, 86, 85, 85, 69, 56, 39, 29, 25, 22, 19, 15, 15, 15] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21216130256652832 seconds Jaccard graph constructed in 0.4791691303253174 seconds Wrote graph to binary file in 0.03528738021850586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888537 Louvain completed 21 runs in 1.3970263004302979 seconds PhenoGraph complete in 2.1338586807250977 seconds Found communities [-1, ... 21], with sizes: [161, 187, 161, 129, 114, 110, 97, 86, 82, 80, 74, 71, 54, 47, 35, 32, 32, 23, 21, 20, 20, 17, 15] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11372590065002441 seconds Jaccard graph constructed in 0.5232908725738525 seconds Wrote graph to binary file in 0.04384946823120117 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882984 Louvain completed 21 runs in 1.6067731380462646 seconds PhenoGraph complete in 2.300321578979492 seconds Found communities [-1, ... 21], with sizes: [143, 175, 166, 154, 108, 97, 88, 86, 84, 81, 75, 72, 65, 59, 41, 40, 31, 21, 20, 18, 18, 13, 13] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1136023998260498 seconds Jaccard graph constructed in 0.495316743850708 seconds Wrote graph to binary file in 0.3014347553253174 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885818 Louvain completed 21 runs in 1.4269380569458008 seconds PhenoGraph complete in 2.3467276096343994 seconds Found communities [-1, ... 22], with sizes: [142, 184, 182, 147, 101, 92, 85, 84, 78, 77, 76, 75, 55, 51, 46, 33, 29, 24, 21, 21, 19, 17, 16, 13] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11040973663330078 seconds Jaccard graph constructed in 0.5481421947479248 seconds Wrote graph to binary file in 0.04265451431274414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.883822 Louvain completed 21 runs in 1.455559253692627 seconds PhenoGraph complete in 2.169980525970459 seconds Found communities [-1, ... 
19], with sizes: [151, 179, 157, 138, 102, 99, 92, 92, 90, 88, 77, 69, 68, 59, 46, 42, 38, 27, 21, 17, 16] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11211466789245605 seconds Jaccard graph constructed in 0.5048460960388184 seconds Wrote graph to binary file in 0.03805422782897949 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.884942 Louvain completed 21 runs in 1.3988964557647705 seconds PhenoGraph complete in 2.074927806854248 seconds Found communities [-1, ... 19], with sizes: [146, 164, 151, 110, 106, 105, 93, 91, 88, 84, 84, 79, 77, 74, 56, 42, 36, 23, 23, 22, 14] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10606861114501953 seconds Jaccard graph constructed in 0.44935131072998047 seconds Wrote graph to binary file in 0.05611062049865723 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882788 After 2 runs, maximum modularity is Q = 0.884078 Louvain completed 22 runs in 1.7772152423858643 seconds PhenoGraph complete in 2.406134843826294 seconds Found communities [-1, ... 21], with sizes: [127, 161, 130, 122, 110, 106, 94, 91, 90, 77, 76, 71, 68, 67, 63, 62, 36, 23, 22, 21, 20, 17, 14] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1152658462524414 seconds Jaccard graph constructed in 0.535527229309082 seconds Wrote graph to binary file in 0.04455399513244629 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.882038 After 2 runs, maximum modularity is Q = 0.883485 Louvain completed 22 runs in 1.9561593532562256 seconds PhenoGraph complete in 2.664527654647827 seconds Found communities [-1, ... 21], with sizes: [148, 173, 155, 141, 128, 120, 96, 82, 81, 81, 77, 72, 71, 52, 45, 30, 24, 19, 18, 15, 14, 13, 13] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10978841781616211 seconds Jaccard graph constructed in 0.48043322563171387 seconds Wrote graph to binary file in 0.04678463935852051 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890257 Louvain completed 21 runs in 1.4102518558502197 seconds PhenoGraph complete in 2.0746712684631348 seconds Found communities [-1, ... 22], with sizes: [161, 181, 149, 108, 107, 100, 94, 90, 84, 75, 73, 63, 60, 56, 51, 42, 41, 25, 24, 22, 18, 16, 15, 13] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11337614059448242 seconds Jaccard graph constructed in 0.4776909351348877 seconds Wrote graph to binary file in 0.03323507308959961 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.883034 After 4 runs, maximum modularity is Q = 0.884892 Louvain completed 24 runs in 1.7922523021697998 seconds PhenoGraph complete in 2.4263017177581787 seconds Found communities [-1, ... 19], with sizes: [140, 165, 129, 124, 108, 102, 98, 91, 90, 86, 84, 81, 79, 77, 75, 36, 27, 25, 18, 17, 16] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11046648025512695 seconds Jaccard graph constructed in 0.5300097465515137 seconds Wrote graph to binary file in 0.034813880920410156 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886437 After 3 runs, maximum modularity is Q = 0.887873 Louvain completed 23 runs in 1.7248668670654297 seconds PhenoGraph complete in 2.4114723205566406 seconds Found communities [-1, ... 21], with sizes: [174, 165, 154, 135, 104, 93, 87, 84, 84, 78, 76, 67, 64, 55, 46, 37, 36, 34, 26, 25, 18, 14, 12] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11335539817810059 seconds Jaccard graph constructed in 0.4874453544616699 seconds Wrote graph to binary file in 0.29427146911621094 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.883389 After 2 runs, maximum modularity is Q = 0.884773 Louvain completed 22 runs in 1.6713039875030518 seconds PhenoGraph complete in 2.5767195224761963 seconds Found communities [-1, ... 21], with sizes: [119, 241, 170, 154, 98, 93, 83, 81, 74, 73, 68, 65, 62, 46, 43, 34, 31, 29, 27, 21, 20, 18, 18] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1126549243927002 seconds Jaccard graph constructed in 0.5239880084991455 seconds Wrote graph to binary file in 0.03842663764953613 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.883117 After 2 runs, maximum modularity is Q = 0.885533 Louvain completed 22 runs in 1.967371940612793 seconds PhenoGraph complete in 2.654338836669922 seconds Found communities [-1, ... 18], with sizes: [159, 254, 162, 155, 125, 106, 97, 90, 83, 72, 72, 55, 47, 44, 30, 28, 27, 23, 22, 17] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.10812759399414062 seconds Jaccard graph constructed in 0.4925107955932617 seconds Wrote graph to binary file in 0.036156415939331055 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.881369 After 3 runs, maximum modularity is Q = 0.88393 Louvain completed 23 runs in 1.7236568927764893 seconds PhenoGraph complete in 2.3699145317077637 seconds Found communities [-1, ... 21], with sizes: [135, 160, 159, 156, 103, 88, 88, 82, 80, 78, 77, 76, 70, 67, 47, 44, 32, 31, 22, 19, 19, 18, 17] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11336946487426758 seconds Jaccard graph constructed in 0.47954440116882324 seconds Wrote graph to binary file in 0.0342555046081543 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885208 After 4 runs, maximum modularity is Q = 0.886291 Louvain completed 24 runs in 1.7783267498016357 seconds PhenoGraph complete in 2.415379285812378 seconds Found communities [-1, ... 20], with sizes: [139, 266, 179, 97, 93, 92, 90, 82, 80, 74, 73, 68, 60, 50, 49, 43, 38, 25, 22, 18, 16, 14] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11088943481445312 seconds Jaccard graph constructed in 0.488300085067749 seconds Wrote graph to binary file in 0.03625321388244629 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.884459 After 4 runs, maximum modularity is Q = 0.885983 After 5 runs, maximum modularity is Q = 0.88705 Louvain completed 25 runs in 2.0681710243225098 seconds PhenoGraph complete in 2.7141079902648926 seconds Found communities [-1, ... 21], with sizes: [151, 248, 155, 111, 111, 89, 83, 76, 73, 68, 65, 63, 60, 56, 53, 44, 33, 32, 31, 23, 15, 15, 13] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21431660652160645 seconds Jaccard graph constructed in 0.5337605476379395 seconds Wrote graph to binary file in 0.04564046859741211 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.884847 Louvain completed 21 runs in 1.6112499237060547 seconds PhenoGraph complete in 2.419363021850586 seconds Found communities [-1, ... 21], with sizes: [117, 161, 140, 140, 102, 97, 88, 84, 81, 81, 79, 79, 76, 74, 70, 37, 36, 26, 26, 24, 20, 18, 12]
# Scale every cell to 10,000 total counts, then log1p-transform in place.
sc.pp.normalize_per_cell(D339_Brus_Dis1, counts_per_cell_after=1e4)
sc.pp.log1p(D339_Brus_Dis1)

# Keep the full gene set (already normalized and log-transformed at this
# point) in .raw before the gene subset below.
D339_Brus_Dis1.raw = D339_Brus_Dis1

# var['ribo_genes'] was set during QC to True for NON-ribosomal genes, so
# indexing with it drops the ribosomal genes.
genes_to_keep = D339_Brus_Dis1.var['ribo_genes']
D339_Brus_Dis1 = D339_Brus_Dis1[:, genes_to_keep]
D339_Brus_Dis1
View of AnnData object with n_obs × n_vars = 1335 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x matrix of sample D344_Brus_Dis1, indexing genes by symbol.
D344_Brus_Dis1 = sc.read_10x_mtx(
    './D344_Brus_Dis1/' + outsPath,
    var_names='gene_symbols',
    cache=True,
)
D344_Brus_Dis1.var_names_make_unique()

# Sample-level annotations; each value is the same for every cell of the run.
D344_Brus_Dis1.obs['manip'] = 'D344_Brus_Dis1'
D344_Brus_Dis1.obs['position'] = 'Distal'
D344_Brus_Dis1.obs['method'] = 'Brushing'
D344_Brus_Dis1.obs['donor'] = 'D344'

# Prefix each cell name with the sample name and use the result as the
# observation names.
D344_Brus_Dis1.obs['name'] = [
    'D344_Brus_Dis1_' + barcode for barcode in D344_Brus_Dis1.obs.index
]
D344_Brus_Dis1.obs_names = D344_Brus_Dis1.obs['name']
D344_Brus_Dis1
... reading from cache file ./cache/D344_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2817 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Quick look at the 20 most highly expressed genes of the sample.
sc.pl.highest_expr_genes(D344_Brus_Dis1, n_top=20)

# min_genes=0 is not meant to remove cells here; the call is used because it
# adds the per-cell 'n_genes' column to .obs.
sc.pp.filter_cells(D344_Brus_Dis1, min_genes=0)

# Per-cell total counts: computed once and reused as the denominator of every
# fraction below (the matrix is not modified in between).
total_counts = D344_Brus_Dis1.X.sum(axis=1).A1

# Fraction of counts coming from mitochondrial genes ('MT-' prefix).
mito_genes = D344_Brus_Dis1.var_names.str.startswith('MT-')
D344_Brus_Dis1.obs['percent_mito'] = (
    D344_Brus_Dis1[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D344_Brus_Dis1.obs['n_counts'] = total_counts

# remove_RB_genes supplies the list of ribosomal genes present in this
# dataset. The gene mask is then built from var_names directly, which avoids
# converting the whole matrix to a dense DataFrame a second time.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D344_Brus_Dis1.to_df()
)
ribo_genes = D344_Brus_Dis1.var_names.isin(RB_genes_in_df)
D344_Brus_Dis1.obs['percent_ribo'] = (
    D344_Brus_Dis1[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)

# NOTE: despite its name, var['ribo_genes'] is True for NON-ribosomal genes.
# It is the keep-mask used after normalization to drop the ribosomal genes.
D344_Brus_Dis1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(
    D344_Brus_Dis1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for this sample, in three successive steps:
# at least 500 detected genes, fewer than 30000 total counts, and a
# mitochondrial fraction below 0.3.
sc.pp.filter_cells(D344_Brus_Dis1, min_genes=500)

below_count_cap = D344_Brus_Dis1.obs['n_counts'] < 30000
D344_Brus_Dis1 = D344_Brus_Dis1[below_count_cap, :]

# Evaluated on the already count-filtered object, as the steps are sequential.
below_mito_cap = D344_Brus_Dis1.obs['percent_mito'] < 0.3
D344_Brus_Dis1 = D344_Brus_Dis1[below_mito_cap, :]
filtered out 8 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the QC-filtered matrix; no normalization
# has been applied to this sample at this point in the cell sequence.
# NOTE(review): expected_doublet_rate differs between samples; presumably it
# was chosen from the number of cells in each run -- confirm.
scrub = scr.Scrublet(D344_Brus_Dis1.X, expected_doublet_rate=0.023)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store the per-cell score and the doublet call in the cell metadata.
D344_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D344_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
# Histogram of doublet scores, to check the automatically chosen threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.17 Detected doublet rate = 1.4% Estimated detectable doublet fraction = 42.4% Overall doublet rate: Expected = 2.3% Estimated = 3.4% Elapsed time: 2.5 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea282b630>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea1389ac8>],
dtype=object))
# doubletDetection
# Second doublet inference, with DoubletDetection's BoostClassifier (default
# parameters), run on the same QC-filtered matrix as Scrublet above.
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D344_Brus_Dis1.X).predict()
# Store the per-cell call in the cell metadata.
# NOTE(review): presumably 1 = doublet, 0 = singlet -- confirm against the
# doubletdetection documentation.
D344_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4087679386138916 seconds Jaccard graph constructed in 0.676398515701294 seconds Wrote graph to binary file in 0.06066608428955078 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908475 After 2 runs, maximum modularity is Q = 0.910893 Louvain completed 22 runs in 2.0190682411193848 seconds PhenoGraph complete in 3.1778438091278076 seconds Found communities [-1, ... 21], with sizes: [180, 658, 412, 412, 283, 264, 186, 150, 147, 114, 87, 81, 79, 79, 67, 54, 48, 44, 39, 33, 31, 26, 12] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5136964321136475 seconds Jaccard graph constructed in 0.6911904811859131 seconds Wrote graph to binary file in 0.06196713447570801 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905674 Louvain completed 21 runs in 1.6694996356964111 seconds PhenoGraph complete in 2.9525020122528076 seconds Found communities [-1, ... 21], with sizes: [203, 654, 435, 407, 294, 235, 170, 140, 117, 116, 115, 80, 76, 73, 72, 64, 57, 47, 37, 35, 32, 16, 11] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5086314678192139 seconds Jaccard graph constructed in 0.6270010471343994 seconds Wrote graph to binary file in 0.32866406440734863 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909567 Louvain completed 21 runs in 1.693134069442749 seconds PhenoGraph complete in 3.1828861236572266 seconds Found communities [-1, ... 20], with sizes: [157, 426, 424, 374, 314, 257, 223, 190, 173, 165, 118, 98, 94, 91, 85, 71, 56, 53, 41, 36, 21, 19] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30912256240844727 seconds Jaccard graph constructed in 0.6146574020385742 seconds Wrote graph to binary file in 0.0640110969543457 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908697 Louvain completed 21 runs in 1.705700159072876 seconds PhenoGraph complete in 2.7090213298797607 seconds Found communities [-1, ... 21], with sizes: [139, 462, 425, 396, 252, 250, 213, 186, 148, 130, 130, 114, 76, 73, 72, 72, 71, 59, 55, 47, 46, 36, 34] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4136331081390381 seconds Jaccard graph constructed in 0.6199691295623779 seconds Wrote graph to binary file in 0.06575632095336914 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90764 After 2 runs, maximum modularity is Q = 0.909513 Louvain completed 22 runs in 2.0296332836151123 seconds PhenoGraph complete in 3.145488739013672 seconds Found communities [-1, ... 21], with sizes: [153, 560, 451, 392, 343, 298, 209, 130, 111, 102, 81, 80, 77, 75, 74, 71, 68, 66, 45, 35, 25, 20, 20] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30895423889160156 seconds Jaccard graph constructed in 0.7187116146087646 seconds Wrote graph to binary file in 0.30313754081726074 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909477 Louvain completed 21 runs in 1.780221939086914 seconds PhenoGraph complete in 3.1288363933563232 seconds Found communities [-1, ... 18], with sizes: [186, 642, 590, 286, 279, 222, 212, 205, 126, 111, 108, 86, 86, 73, 62, 60, 59, 37, 33, 23] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41129422187805176 seconds Jaccard graph constructed in 0.6924159526824951 seconds Wrote graph to binary file in 0.06309294700622559 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908917 Louvain completed 21 runs in 1.7447381019592285 seconds PhenoGraph complete in 2.9282872676849365 seconds Found communities [-1, ... 23], with sizes: [152, 664, 450, 371, 278, 228, 186, 144, 133, 112, 105, 94, 84, 78, 72, 63, 62, 51, 35, 35, 34, 19, 13, 12, 11] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4090301990509033 seconds Jaccard graph constructed in 0.6362454891204834 seconds Wrote graph to binary file in 0.06520462036132812 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909113 Louvain completed 21 runs in 1.7199881076812744 seconds PhenoGraph complete in 2.8456263542175293 seconds Found communities [-1, ... 21], with sizes: [162, 542, 452, 399, 288, 267, 251, 210, 136, 110, 98, 91, 88, 81, 69, 64, 55, 34, 31, 21, 14, 12, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3094162940979004 seconds Jaccard graph constructed in 0.6429347991943359 seconds Wrote graph to binary file in 0.06117081642150879 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909833 Louvain completed 21 runs in 1.7047889232635498 seconds PhenoGraph complete in 2.7326271533966064 seconds Found communities [-1, ... 
21], with sizes: [164, 479, 447, 398, 331, 298, 194, 151, 142, 140, 114, 96, 87, 86, 85, 63, 54, 46, 35, 29, 21, 15, 11] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40952610969543457 seconds Jaccard graph constructed in 0.6572070121765137 seconds Wrote graph to binary file in 0.4156205654144287 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910152 After 13 runs, maximum modularity is Q = 0.911166 Louvain completed 33 runs in 2.755455493927002 seconds PhenoGraph complete in 4.256148815155029 seconds Found communities [-1, ... 22], with sizes: [193, 449, 415, 411, 292, 266, 209, 188, 129, 111, 106, 103, 94, 89, 73, 68, 65, 57, 51, 31, 31, 21, 18, 16] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4093315601348877 seconds Jaccard graph constructed in 0.6507017612457275 seconds Wrote graph to binary file in 0.06365847587585449 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910742 Louvain completed 21 runs in 1.6846542358398438 seconds PhenoGraph complete in 2.825324296951294 seconds Found communities [-1, ... 19], with sizes: [175, 591, 451, 425, 326, 291, 206, 119, 112, 110, 109, 75, 75, 73, 73, 69, 67, 57, 34, 33, 15] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30977797508239746 seconds Jaccard graph constructed in 0.6322612762451172 seconds Wrote graph to binary file in 0.06093192100524902 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.91032 Louvain completed 21 runs in 1.7355291843414307 seconds PhenoGraph complete in 2.752293586730957 seconds Found communities [-1, ... 21], with sizes: [162, 440, 431, 420, 381, 274, 215, 162, 158, 129, 105, 94, 87, 71, 57, 55, 52, 48, 47, 36, 34, 15, 13] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31106996536254883 seconds Jaccard graph constructed in 0.7185852527618408 seconds Wrote graph to binary file in 0.32609009742736816 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911352 After 12 runs, maximum modularity is Q = 0.912368 Louvain completed 32 runs in 2.7134311199188232 seconds PhenoGraph complete in 4.0834503173828125 seconds Found communities [-1, ... 22], with sizes: [192, 472, 411, 402, 356, 296, 205, 177, 112, 105, 91, 82, 80, 72, 68, 65, 60, 60, 35, 34, 32, 31, 24, 24] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40963220596313477 seconds Jaccard graph constructed in 0.7162106037139893 seconds Wrote graph to binary file in 0.06307792663574219 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905629 After 2 runs, maximum modularity is Q = 0.906655 Louvain completed 22 runs in 2.098421096801758 seconds PhenoGraph complete in 3.3031489849090576 seconds Found communities [-1, ... 19], with sizes: [185, 578, 432, 409, 301, 290, 279, 141, 117, 109, 101, 90, 79, 73, 73, 69, 51, 33, 32, 23, 21] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3199498653411865 seconds Jaccard graph constructed in 0.6934974193572998 seconds Wrote graph to binary file in 0.0614933967590332 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909638 Louvain completed 21 runs in 1.7370011806488037 seconds PhenoGraph complete in 2.8248116970062256 seconds Found communities [-1, ... 22], with sizes: [184, 626, 433, 398, 273, 198, 176, 169, 155, 116, 109, 81, 78, 76, 74, 66, 59, 48, 36, 34, 32, 23, 23, 19] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4096238613128662 seconds Jaccard graph constructed in 0.7094638347625732 seconds Wrote graph to binary file in 0.3516569137573242 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908799 Louvain completed 21 runs in 1.6793816089630127 seconds PhenoGraph complete in 3.1653077602386475 seconds Found communities [-1, ... 19], with sizes: [226, 633, 562, 275, 257, 233, 232, 196, 111, 110, 99, 93, 91, 84, 68, 57, 42, 35, 33, 27, 22] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30857133865356445 seconds Jaccard graph constructed in 0.6490199565887451 seconds Wrote graph to binary file in 0.0794672966003418 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909344 Louvain completed 21 runs in 1.7038657665252686 seconds PhenoGraph complete in 2.763385057449341 seconds Found communities [-1, ... 19], with sizes: [178, 659, 463, 394, 293, 233, 210, 179, 173, 104, 94, 78, 74, 74, 64, 52, 47, 45, 33, 28, 11] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41222643852233887 seconds Jaccard graph constructed in 0.6229174137115479 seconds Wrote graph to binary file in 0.06175661087036133 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910231 Louvain completed 21 runs in 1.7293064594268799 seconds PhenoGraph complete in 2.8399322032928467 seconds Found communities [-1, ... 
19], with sizes: [141, 624, 476, 443, 275, 212, 196, 194, 131, 115, 109, 84, 78, 67, 63, 62, 53, 52, 45, 33, 33] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3116340637207031 seconds Jaccard graph constructed in 0.6231060028076172 seconds Wrote graph to binary file in 0.2984890937805176 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906889 After 3 runs, maximum modularity is Q = 0.908115 After 21 runs, maximum modularity is Q = 0.909208 Louvain completed 41 runs in 3.6392505168914795 seconds PhenoGraph complete in 4.895208835601807 seconds Found communities [-1, ... 22], with sizes: [167, 534, 430, 366, 283, 277, 189, 160, 122, 121, 118, 109, 91, 78, 77, 68, 62, 61, 42, 41, 33, 23, 23, 11] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.40950465202331543 seconds Jaccard graph constructed in 0.6320044994354248 seconds Wrote graph to binary file in 0.06148862838745117 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912087 Louvain completed 21 runs in 1.7090568542480469 seconds PhenoGraph complete in 2.8252131938934326 seconds Found communities [-1, ... 22], with sizes: [207, 426, 414, 369, 361, 265, 192, 188, 165, 119, 108, 85, 80, 68, 68, 68, 57, 52, 49, 34, 33, 33, 23, 22] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3101065158843994 seconds Jaccard graph constructed in 0.7053666114807129 seconds Wrote graph to binary file in 0.06212496757507324 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907665 After 5 runs, maximum modularity is Q = 0.908836 Louvain completed 25 runs in 2.2621588706970215 seconds PhenoGraph complete in 3.356046438217163 seconds Found communities [-1, ... 20], with sizes: [144, 649, 433, 409, 371, 252, 211, 142, 106, 100, 81, 77, 67, 63, 61, 60, 60, 44, 42, 41, 37, 36] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3102238178253174 seconds Jaccard graph constructed in 0.7130467891693115 seconds Wrote graph to binary file in 0.061510562896728516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908361 Louvain completed 21 runs in 1.685814619064331 seconds PhenoGraph complete in 2.784780263900757 seconds Found communities [-1, ... 22], with sizes: [141, 441, 423, 415, 409, 297, 262, 122, 113, 111, 105, 91, 81, 80, 71, 63, 53, 47, 46, 33, 32, 24, 15, 11] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.414994478225708 seconds Jaccard graph constructed in 0.9955241680145264 seconds Wrote graph to binary file in 0.0627129077911377 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909846 Louvain completed 21 runs in 1.7044715881347656 seconds PhenoGraph complete in 3.199652671813965 seconds Found communities [-1, ... 20], with sizes: [167, 682, 429, 398, 290, 230, 228, 206, 109, 105, 89, 76, 72, 68, 64, 53, 51, 50, 35, 35, 25, 24] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3100249767303467 seconds Jaccard graph constructed in 0.6987600326538086 seconds Wrote graph to binary file in 0.0625908374786377 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911101 After 8 runs, maximum modularity is Q = 0.912188 Louvain completed 28 runs in 2.3771181106567383 seconds PhenoGraph complete in 3.4672839641571045 seconds Found communities [-1, ... 21], with sizes: [207, 414, 391, 384, 365, 299, 220, 175, 162, 120, 104, 84, 74, 72, 71, 69, 68, 54, 39, 33, 33, 24, 24] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3088405132293701 seconds Jaccard graph constructed in 0.7303566932678223 seconds Wrote graph to binary file in 0.061276912689208984 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908846 Louvain completed 21 runs in 1.6693217754364014 seconds PhenoGraph complete in 2.788783073425293 seconds Found communities [-1, ... 20], with sizes: [191, 475, 428, 382, 376, 280, 218, 193, 132, 115, 102, 100, 96, 75, 71, 67, 67, 34, 34, 22, 16, 12]
# Post-doublet-inference processing of D344_Brus_Dis1. The order of the four
# statements below matters: normalise -> log-transform -> snapshot -> subset.

# Scale every cell to a total of 10,000 counts.
sc.pp.normalize_per_cell(D344_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D344_Brus_Dis1) # log transform the data
# NOTE: the snapshot is taken AFTER normalisation and log1p, so `.raw` holds the
# log-normalised values for all genes, not the original UMI counts.
D344_Brus_Dis1.raw = D344_Brus_Dis1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): despite its name, this flag appears to be True for NON-ribosomal
# genes (elsewhere in this notebook it is built as the negation of the
# ribosomal-gene mask), so this line drops the ribosomal genes - confirm.
# The result is a view of the previous object, not an independent copy.
D344_Brus_Dis1 = D344_Brus_Dis1[:, D344_Brus_Dis1.var['ribo_genes']]
# Display the resulting object summary.
D344_Brus_Dis1
View of AnnData object with n_obs × n_vars = 2789 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the D353_Brus_Dis1 sample from its 10x matrix directory, using gene
# symbols as variable names (cached on disk after the first read).
D353_Brus_Dis1 = sc.read_10x_mtx(
    './D353_Brus_Dis1/' + outsPath, var_names='gene_symbols', cache=True)
# Gene symbols are not guaranteed to be unique; deduplicate them.
D353_Brus_Dis1.var_names_make_unique()

# Sample-level annotations, repeated on every cell. The assignment order fixes
# the column order of .obs.
D353_Brus_Dis1.obs['manip'] = 'D353_Brus_Dis1'
D353_Brus_Dis1.obs['position'] = 'Distal'
D353_Brus_Dis1.obs['method'] = 'Brushing'
D353_Brus_Dis1.obs['donor'] = 'D353'

# Prefix every barcode with the sample name and use the result as the cell
# index as well as an ordinary .obs column.
D353_Brus_Dis1.obs['name'] = [
    'D353_Brus_Dis1_' + barcode for barcode in D353_Brus_Dis1.obs.index]
D353_Brus_Dis1.obs_names = D353_Brus_Dis1.obs['name']

# Display the object summary.
D353_Brus_Dis1
... reading from cache file ./cache/D353_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 4787 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# --- Per-cell quality-control metrics for D353_Brus_Dis1 ---

# Plot the 20 most highly expressed genes of the sample.
sc.pl.highest_expr_genes(D353_Brus_Dis1, n_top=20)

# min_genes=0 is used here for its side effect: it adds the per-cell
# 'n_genes' column to .obs (plotted below); the real threshold is applied later.
sc.pp.filter_cells(D353_Brus_Dis1, min_genes=0)

# Mitochondrial content: genes whose symbol starts with 'MT-'.
# NOTE: 'percent_mito' / 'percent_ribo' are fractions in [0, 1], not percentages.
mito_genes = D353_Brus_Dis1.var_names.str.startswith('MT-')
D353_Brus_Dis1.obs['percent_mito'] = np.sum(
    D353_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D353_Brus_Dis1.X, axis=1).A1
# Total counts per cell.
D353_Brus_Dis1.obs['n_counts'] = D353_Brus_Dis1.X.sum(axis=1).A1

# remove_RB_genes works on a dense cells x genes DataFrame; below only the list
# of ribosomal genes present in this sample (RB_genes_in_df) is needed.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Brus_Dis1.to_df())
# The columns of to_df() are exactly var_names, so build the ribosomal mask
# from var_names directly instead of densifying the whole matrix a second time.
ribo_genes = D353_Brus_Dis1.var_names.isin(RB_genes_in_df)
D353_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D353_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D353_Brus_Dis1.X, axis=1).A1
# NOTE: despite its name, var['ribo_genes'] is True for NON-ribosomal genes;
# it is the mask of genes to keep when ribosomal genes are dropped.
D353_Brus_Dis1.var['ribo_genes'] = ~ribo_genes

# Distribution of the four QC metrics across cells.
sc.pl.violin(D353_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# --- Low-quality cell filtering for D353_Brus_Dis1 ---

# Drop cells with fewer than 500 detected genes (modifies the object in place).
sc.pp.filter_cells(D353_Brus_Dis1, min_genes=500)
# Keep cells with fewer than 20000 total counts and a mitochondrial fraction
# below 0.5. Both criteria are applied in a single subset, and the result is
# materialised with .copy(): the following cells write new .obs columns, which
# should happen on a real object rather than on a view of a view.
D353_Brus_Dis1 = D353_Brus_Dis1[
    (D353_Brus_Dis1.obs['n_counts'] < 20000)
    & (D353_Brus_Dis1.obs['percent_mito'] < 0.5), :].copy()
filtered out 200 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the filtered expression matrix; no
# normalisation has been applied to this object in the preceding cells.
# NOTE(review): the origin of the 3.9% expected doublet rate is not shown here
# - presumably derived from the number of cells loaded; confirm.
scrub = scr.Scrublet(D353_Brus_Dis1.X, expected_doublet_rate=0.039)
# Per-cell doublet score and boolean doublet call.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D353_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D353_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
# Histograms of doublet scores, used to check the automatically chosen threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.19 Detected doublet rate = 2.4% Estimated detectable doublet fraction = 50.7% Overall doublet rate: Expected = 3.9% Estimated = 4.8% Elapsed time: 3.7 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea9ce0e48>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e8828c048>],
dtype=object))
# doubletDetection
# Second, independent doublet inference with DoubletDetection, using the
# classifier's default settings on the same filtered matrix.
clf = doubletdetection.BoostClassifier()
# fit() returns the classifier itself, so fitting and predicting are done in
# two separate steps here.
clf.fit(D353_Brus_Dis1.X)
labels = clf.predict()
# Store the per-cell call in the cell metadata.
D353_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0162525177001953 seconds Jaccard graph constructed in 0.8561174869537354 seconds Wrote graph to binary file in 0.3788738250732422 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906007 After 7 runs, maximum modularity is Q = 0.907108 Louvain completed 27 runs in 2.9793484210968018 seconds PhenoGraph complete in 5.250179290771484 seconds Found communities [-1, ... 20], with sizes: [202, 1070, 828, 405, 384, 362, 349, 340, 335, 285, 193, 191, 163, 116, 107, 98, 87, 86, 55, 34, 22, 19] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.025327205657959 seconds Jaccard graph constructed in 0.9080049991607666 seconds Wrote graph to binary file in 0.10102081298828125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907931 Louvain completed 21 runs in 2.151581048965454 seconds PhenoGraph complete in 4.204631567001343 seconds Found communities [-1, ... 26], with sizes: [167, 1100, 521, 426, 357, 350, 320, 312, 300, 227, 195, 191, 187, 140, 124, 115, 111, 103, 101, 79, 77, 68, 54, 31, 26, 21, 17, 11] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9135782718658447 seconds Jaccard graph constructed in 0.881655216217041 seconds Wrote graph to binary file in 0.10137200355529785 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903933 After 2 runs, maximum modularity is Q = 0.906896 After 11 runs, maximum modularity is Q = 0.908669 Louvain completed 31 runs in 3.568588972091675 seconds PhenoGraph complete in 5.4850242137908936 seconds Found communities [-1, ... 21], with sizes: [195, 1020, 678, 636, 503, 475, 375, 234, 211, 192, 178, 171, 140, 114, 111, 105, 103, 98, 73, 52, 32, 24, 11] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8154387474060059 seconds Jaccard graph constructed in 1.1768968105316162 seconds Wrote graph to binary file in 0.09696412086486816 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906034 Louvain completed 21 runs in 2.146071195602417 seconds PhenoGraph complete in 4.2581493854522705 seconds Found communities [-1, ... 23], with sizes: [187, 1133, 735, 429, 366, 362, 348, 288, 285, 216, 170, 168, 144, 131, 122, 97, 95, 93, 86, 83, 64, 62, 32, 18, 17] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8135569095611572 seconds Jaccard graph constructed in 0.9032831192016602 seconds Wrote graph to binary file in 0.370877742767334 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907711 Louvain completed 21 runs in 2.191746473312378 seconds PhenoGraph complete in 4.298296928405762 seconds Found communities [-1, ... 25], with sizes: [162, 954, 715, 632, 483, 370, 316, 247, 227, 212, 191, 189, 174, 118, 111, 109, 97, 97, 82, 62, 58, 36, 24, 23, 18, 12, 12] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7135679721832275 seconds Jaccard graph constructed in 0.8894424438476562 seconds Wrote graph to binary file in 0.09662818908691406 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909855 Louvain completed 21 runs in 2.1778337955474854 seconds PhenoGraph complete in 3.897204875946045 seconds Found communities [-1, ... 22], with sizes: [181, 1022, 729, 463, 443, 440, 409, 306, 288, 201, 176, 127, 121, 111, 111, 97, 96, 88, 81, 75, 70, 55, 22, 19] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9132382869720459 seconds Jaccard graph constructed in 0.8571217060089111 seconds Wrote graph to binary file in 0.09652829170227051 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910012 After 3 runs, maximum modularity is Q = 0.911375 Louvain completed 23 runs in 2.597601890563965 seconds PhenoGraph complete in 4.483564138412476 seconds Found communities [-1, ... 23], with sizes: [211, 1158, 546, 513, 499, 413, 290, 273, 256, 228, 179, 167, 126, 117, 115, 100, 98, 92, 88, 84, 59, 53, 23, 22, 21] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8131711483001709 seconds Jaccard graph constructed in 1.1122410297393799 seconds Wrote graph to binary file in 0.09736990928649902 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906841 After 3 runs, maximum modularity is Q = 0.908321 Louvain completed 23 runs in 2.661264657974243 seconds PhenoGraph complete in 4.7019267082214355 seconds Found communities [-1, ... 21], with sizes: [168, 1025, 639, 588, 451, 329, 298, 276, 264, 262, 227, 199, 152, 131, 117, 102, 102, 99, 84, 80, 62, 55, 21] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9124712944030762 seconds Jaccard graph constructed in 0.8465819358825684 seconds Wrote graph to binary file in 0.33472323417663574 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907178 After 3 runs, maximum modularity is Q = 0.908195 Louvain completed 23 runs in 2.7131783962249756 seconds PhenoGraph complete in 4.826534748077393 seconds Found communities [-1, ... 24], with sizes: [198, 1076, 780, 721, 405, 334, 241, 225, 220, 206, 188, 126, 125, 125, 106, 98, 86, 83, 82, 78, 61, 53, 40, 35, 21, 18] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8127193450927734 seconds Jaccard graph constructed in 0.8970918655395508 seconds Wrote graph to binary file in 0.09639120101928711 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907681 After 2 runs, maximum modularity is Q = 0.908963 Louvain completed 22 runs in 2.432924270629883 seconds PhenoGraph complete in 4.258150815963745 seconds Found communities [-1, ... 25], with sizes: [187, 1105, 591, 578, 430, 400, 324, 289, 219, 218, 190, 177, 159, 133, 125, 108, 96, 81, 77, 63, 55, 29, 26, 21, 20, 16, 14] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0133864879608154 seconds Jaccard graph constructed in 0.882857084274292 seconds Wrote graph to binary file in 0.09488463401794434 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906973 After 4 runs, maximum modularity is Q = 0.908767 Louvain completed 24 runs in 2.723487615585327 seconds PhenoGraph complete in 4.733052968978882 seconds Found communities [-1, ... 22], with sizes: [187, 1055, 543, 499, 454, 391, 345, 329, 271, 220, 194, 162, 157, 127, 120, 101, 97, 95, 93, 88, 77, 59, 45, 22] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8132424354553223 seconds Jaccard graph constructed in 0.9040405750274658 seconds Wrote graph to binary file in 0.3261268138885498 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906486 After 5 runs, maximum modularity is Q = 0.907882 Louvain completed 25 runs in 2.90863299369812 seconds PhenoGraph complete in 4.972553730010986 seconds Found communities [-1, ... 21], with sizes: [199, 1338, 666, 618, 542, 418, 257, 234, 210, 200, 145, 127, 114, 112, 94, 90, 88, 82, 76, 52, 32, 26, 11] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7141628265380859 seconds Jaccard graph constructed in 0.8912928104400635 seconds Wrote graph to binary file in 0.09833002090454102 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909339 Louvain completed 21 runs in 2.091317892074585 seconds PhenoGraph complete in 3.8150699138641357 seconds Found communities [-1, ... 22], with sizes: [169, 1132, 590, 589, 457, 379, 340, 325, 218, 188, 148, 127, 126, 124, 116, 115, 115, 115, 99, 77, 73, 54, 32, 23] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.91365647315979 seconds Jaccard graph constructed in 1.1987249851226807 seconds Wrote graph to binary file in 0.09418463706970215 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906286 After 2 runs, maximum modularity is Q = 0.908241 Louvain completed 22 runs in 2.516464948654175 seconds PhenoGraph complete in 4.742603778839111 seconds Found communities [-1, ... 23], with sizes: [166, 1060, 662, 517, 381, 358, 326, 292, 291, 211, 197, 192, 164, 146, 135, 104, 102, 100, 79, 73, 61, 54, 24, 23, 13] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7124850749969482 seconds Jaccard graph constructed in 0.8991072177886963 seconds Wrote graph to binary file in 0.3332068920135498 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909318 Louvain completed 21 runs in 2.1733670234680176 seconds PhenoGraph complete in 4.137807369232178 seconds Found communities [-1, ... 24], with sizes: [167, 1343, 761, 417, 387, 382, 252, 251, 236, 190, 169, 135, 126, 123, 114, 112, 93, 86, 81, 80, 61, 55, 38, 36, 19, 17] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8131327629089355 seconds Jaccard graph constructed in 0.8925216197967529 seconds Wrote graph to binary file in 0.09775638580322266 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909605 After 5 runs, maximum modularity is Q = 0.910888 Louvain completed 25 runs in 2.690352201461792 seconds PhenoGraph complete in 4.513716220855713 seconds Found communities [-1, ... 25], with sizes: [191, 1167, 665, 431, 369, 359, 278, 272, 248, 222, 189, 180, 171, 119, 118, 114, 95, 88, 84, 83, 80, 73, 57, 23, 20, 19, 16] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.8124220371246338 seconds Jaccard graph constructed in 0.8856863975524902 seconds Wrote graph to binary file in 0.09579825401306152 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90632 After 4 runs, maximum modularity is Q = 0.907518 Louvain completed 24 runs in 2.6654577255249023 seconds PhenoGraph complete in 4.477853536605835 seconds Found communities [-1, ... 22], with sizes: [193, 1006, 831, 446, 421, 395, 370, 322, 267, 226, 189, 170, 123, 115, 106, 99, 94, 93, 70, 64, 58, 33, 20, 20] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.817241907119751 seconds Jaccard graph constructed in 1.1534202098846436 seconds Wrote graph to binary file in 0.09583544731140137 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905742 After 3 runs, maximum modularity is Q = 0.906999 Louvain completed 23 runs in 2.583540678024292 seconds PhenoGraph complete in 4.670905590057373 seconds Found communities [-1, ... 25], with sizes: [185, 969, 537, 475, 441, 389, 334, 330, 219, 203, 202, 167, 159, 148, 140, 126, 102, 96, 94, 89, 80, 67, 59, 41, 37, 23, 19] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9173688888549805 seconds Jaccard graph constructed in 0.857006311416626 seconds Wrote graph to binary file in 0.09522342681884766 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907883 Louvain completed 21 runs in 2.2362606525421143 seconds PhenoGraph complete in 4.1246726512908936 seconds Found communities [-1, ... 20], with sizes: [214, 1014, 518, 517, 505, 503, 473, 316, 273, 220, 184, 156, 142, 112, 112, 107, 100, 92, 79, 54, 21, 19] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.714038610458374 seconds Jaccard graph constructed in 1.1092274188995361 seconds Wrote graph to binary file in 0.09666991233825684 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904754 After 4 runs, maximum modularity is Q = 0.906495 Louvain completed 24 runs in 2.6600112915039062 seconds PhenoGraph complete in 4.597906589508057 seconds Found communities [-1, ... 23], with sizes: [172, 1016, 666, 585, 525, 524, 439, 239, 227, 194, 127, 125, 124, 113, 101, 91, 79, 75, 73, 71, 54, 46, 31, 23, 11] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 1.0234904289245605 seconds Jaccard graph constructed in 0.9439792633056641 seconds Wrote graph to binary file in 0.10279488563537598 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908928 After 2 runs, maximum modularity is Q = 0.910043 Louvain completed 22 runs in 2.502131462097168 seconds PhenoGraph complete in 4.601895093917847 seconds Found communities [-1, ... 23], with sizes: [198, 1197, 566, 562, 384, 316, 312, 283, 280, 266, 195, 189, 147, 143, 116, 99, 95, 90, 81, 67, 54, 28, 22, 21, 20] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.818598747253418 seconds Jaccard graph constructed in 1.184175968170166 seconds Wrote graph to binary file in 0.0951235294342041 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905792 After 6 runs, maximum modularity is Q = 0.906813 Louvain completed 26 runs in 2.7821874618530273 seconds PhenoGraph complete in 4.901823282241821 seconds Found communities [-1, ... 24], with sizes: [165, 1066, 756, 553, 414, 341, 296, 294, 269, 172, 163, 160, 145, 133, 124, 99, 97, 92, 79, 77, 63, 54, 46, 40, 22, 11] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7158310413360596 seconds Jaccard graph constructed in 0.9046008586883545 seconds Wrote graph to binary file in 0.3708937168121338 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908037 Louvain completed 21 runs in 2.161968469619751 seconds PhenoGraph complete in 4.17364501953125 seconds Found communities [-1, ... 24], with sizes: [192, 1196, 658, 485, 350, 329, 296, 267, 257, 250, 195, 189, 171, 120, 114, 108, 104, 94, 83, 82, 68, 54, 19, 18, 18, 14] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.7137269973754883 seconds Jaccard graph constructed in 0.8944852352142334 seconds Wrote graph to binary file in 0.0983583927154541 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90756 After 4 runs, maximum modularity is Q = 0.909133 Louvain completed 24 runs in 2.8156487941741943 seconds PhenoGraph complete in 4.542718410491943 seconds Found communities [-1, ... 22], with sizes: [176, 1148, 812, 650, 334, 296, 281, 269, 260, 232, 199, 197, 136, 99, 93, 87, 86, 86, 81, 78, 54, 31, 28, 18] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.9139010906219482 seconds Jaccard graph constructed in 0.8702192306518555 seconds Wrote graph to binary file in 0.09519720077514648 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909525 Louvain completed 21 runs in 2.196678638458252 seconds PhenoGraph complete in 4.094508171081543 seconds Found communities [-1, ... 22], with sizes: [215, 1003, 797, 561, 436, 422, 369, 260, 242, 170, 169, 144, 131, 112, 107, 105, 101, 90, 81, 81, 56, 31, 28, 20]
# Post-doublet-inference processing for sample D353_Brus_Dis1.
# Scale every cell to a total of 10,000 counts.
sc.pp.normalize_per_cell(D353_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
# log(1 + x) transform of the normalized values.
sc.pp.log1p(D353_Brus_Dis1) # log transform the data
# Snapshot the object in .raw. NOTE: this is taken AFTER normalization and log
# transform, so .raw holds normalized/log values for the full gene set, not the
# original UMI counts.
D353_Brus_Dis1.raw = D353_Brus_Dis1 # freeze the object (for later use of the raw state of it)
# Keep only the genes whose var['ribo_genes'] flag is True; the result is a view.
# NOTE(review): for the other samples in this file the flag is set to True for
# NON-ribosomal genes, so this presumably drops the ribosomal genes despite the
# column name - confirm against the cell that sets it for this sample.
D353_Brus_Dis1 = D353_Brus_Dis1[:, D353_Brus_Dis1.var['ribo_genes']]
# Bare expression: shows the object summary as the notebook cell output.
D353_Brus_Dis1
View of AnnData object with n_obs × n_vars = 4585 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Read the CellRanger filtered matrix of sample D354_Brus_Dis1, indexing genes
# by symbol. cache=True lets scanpy reuse its cached .h5ad copy on later runs.
D354_Brus_Dis1 = sc.read_10x_mtx(
    './D354_Brus_Dis1/' + outsPath, var_names='gene_symbols', cache=True
)
# Gene symbols can repeat; make the variable names unique.
D354_Brus_Dis1.var_names_make_unique()

# Sample-level annotations, identical for every cell of this dataset.
D354_Brus_Dis1.obs['manip'] = 'D354_Brus_Dis1'
D354_Brus_Dis1.obs['position'] = 'Distal'
D354_Brus_Dis1.obs['method'] = 'Brushing'
D354_Brus_Dis1.obs['donor'] = 'D354'

# Prefix every barcode with the sample name and use the result as the cell
# index (presumably so that cell names stay unique once datasets are aggregated).
D354_Brus_Dis1.obs['name'] = [
    f'D354_Brus_Dis1_{barcode}' for barcode in D354_Brus_Dis1.obs_names
]
D354_Brus_Dis1.obs_names = D354_Brus_Dis1.obs['name']

# Bare expression: shows the object summary as the notebook cell output.
D354_Brus_Dis1
... reading from cache file ./cache/D354_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2674 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Quality-control metrics for sample D354_Brus_Dis1 (computed before filtering).

# Plot the 20 most highly expressed genes.
sc.pl.highest_expr_genes(D354_Brus_Dis1, n_top=20)

# min_genes=0 cannot remove any cell; the call is only there to add the
# per-cell number of detected genes as obs['n_genes'].
sc.pp.filter_cells(D354_Brus_Dis1, min_genes=0)

# Total counts per cell, computed once and reused for both fractions below.
total_counts = D354_Brus_Dis1.X.sum(axis=1).A1

# Fraction (0-1, despite the 'percent' name) of each cell's counts that comes
# from mitochondrial genes, identified by the 'MT-' symbol prefix.
mito_genes = D354_Brus_Dis1.var_names.str.startswith('MT-')
D354_Brus_Dis1.obs['percent_mito'] = (
    D354_Brus_Dis1[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D354_Brus_Dis1.obs['n_counts'] = total_counts

# remove_RB_genes() supplies the list of ribosomal genes present in this
# dataset; its two other return values are not used in this cell.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D354_Brus_Dis1.to_df()
)
# Boolean mask over genes. It is built from var_names directly: to_df() uses
# var_names as its columns, so a second dense conversion is unnecessary.
ribo_genes = D354_Brus_Dis1.var_names.isin(RB_genes_in_df)
D354_Brus_Dis1.obs['percent_ribo'] = (
    D354_Brus_Dis1[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)
# NOTE: despite its name, var['ribo_genes'] is True for NON-ribosomal genes.
# It is used further down as the mask of genes to keep.
D354_Brus_Dis1.var['ribo_genes'] = ~ribo_genes

# Per-cell distributions of the four QC metrics, one panel per metric.
sc.pl.violin(
    D354_Brus_Dis1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for sample D354_Brus_Dis1.
# Drop cells with fewer than 500 detected genes (in place).
sc.pp.filter_cells(D354_Brus_Dis1, min_genes=500)
# Then keep only cells with fewer than 30000 total counts and a mitochondrial
# fraction below 0.5; both conditions are applied in a single subsetting step.
D354_Brus_Dis1 = D354_Brus_Dis1[
    (D354_Brus_Dis1.obs['n_counts'] < 30000)
    & (D354_Brus_Dis1.obs['percent_mito'] < 0.5),
    :
]
filtered out 107 cells that have less than 500 genes expressed
# scrublet
# Doublet inference with Scrublet on the QC-filtered cells. D354_Brus_Dis1.X
# still holds un-normalized counts here: normalize_per_cell / log1p are only
# applied further down.
# NOTE(review): expected_doublet_rate is sample-specific (0.014 is used for
# D363_Brus_Dis1); presumably derived from the number of cells - confirm.
scrub = scr.Scrublet(D354_Brus_Dis1.X, expected_doublet_rate=0.02)
# Returns a per-cell doublet score and a per-cell doublet call; the score
# threshold for the call is set automatically (see the cell output).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the per-cell metadata.
D354_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D354_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
# Plot the doublet-score histograms (the returned figure has two axes) to
# check the automatically chosen threshold visually.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.14 Detected doublet rate = 1.1% Estimated detectable doublet fraction = 48.4% Overall doublet rate: Expected = 2.0% Estimated = 2.3% Elapsed time: 2.1 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9dffff28>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9e887668>],
dtype=object))
# doubletDetection
# Second, independent doublet inference with DoubletDetection, using the
# classifier's default settings (the cell output shows 25 iterations).
clf = doubletdetection.BoostClassifier()
# fit() receives the same un-normalized count matrix as Scrublet; the sparse
# matrix is densified internally (see the UserWarning in the cell output).
labels = clf.fit(D354_Brus_Dis1.X).predict()
# One label per cell, stored in the per-cell metadata.
# NOTE(review): per the DoubletDetection documentation the labels are
# 1 = doublet, 0 = singlet, NaN = ambiguous - confirm for the installed version.
D354_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3109779357910156 seconds Jaccard graph constructed in 0.6154134273529053 seconds Wrote graph to binary file in 0.05150198936462402 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912896 Louvain completed 21 runs in 1.676947832107544 seconds PhenoGraph complete in 2.6751487255096436 seconds Found communities [-1, ... 19], with sizes: [276, 574, 336, 273, 258, 253, 249, 218, 146, 117, 94, 74, 65, 59, 46, 33, 32, 28, 27, 23, 12] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31023359298706055 seconds Jaccard graph constructed in 0.6403079032897949 seconds Wrote graph to binary file in 0.055225372314453125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912606 Louvain completed 21 runs in 1.6850180625915527 seconds PhenoGraph complete in 2.7085413932800293 seconds Found communities [-1, ... 20], with sizes: [247, 464, 368, 268, 250, 246, 244, 217, 158, 133, 129, 82, 82, 79, 69, 40, 28, 26, 18, 16, 15, 14] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41544604301452637 seconds Jaccard graph constructed in 0.6332492828369141 seconds Wrote graph to binary file in 0.3448221683502197 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911734 After 12 runs, maximum modularity is Q = 0.91298 Louvain completed 32 runs in 2.6464929580688477 seconds PhenoGraph complete in 4.052936553955078 seconds Found communities [-1, ... 23], with sizes: [227, 526, 382, 317, 279, 222, 211, 184, 166, 112, 83, 78, 71, 63, 54, 53, 37, 31, 26, 14, 12, 12, 11, 11, 11] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4133937358856201 seconds Jaccard graph constructed in 0.6546018123626709 seconds Wrote graph to binary file in 0.05997109413146973 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.91339 Louvain completed 21 runs in 1.6897292137145996 seconds PhenoGraph complete in 2.8313348293304443 seconds Found communities [-1, ... 21], with sizes: [263, 470, 376, 300, 249, 232, 223, 219, 146, 119, 82, 78, 68, 66, 64, 43, 43, 36, 29, 28, 28, 20, 11] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31443095207214355 seconds Jaccard graph constructed in 0.6954479217529297 seconds Wrote graph to binary file in 0.06105375289916992 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911619 After 7 runs, maximum modularity is Q = 0.912628 Louvain completed 27 runs in 2.322207450866699 seconds PhenoGraph complete in 3.4100241661071777 seconds Found communities [-1, ... 21], with sizes: [273, 580, 357, 262, 255, 251, 215, 176, 156, 146, 84, 79, 74, 65, 51, 39, 30, 23, 21, 20, 13, 12, 11] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.308307409286499 seconds Jaccard graph constructed in 0.6391499042510986 seconds Wrote graph to binary file in 0.31018877029418945 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914066 Louvain completed 21 runs in 1.696565866470337 seconds PhenoGraph complete in 2.983755350112915 seconds Found communities [-1, ... 22], with sizes: [234, 571, 316, 269, 229, 223, 189, 182, 157, 142, 122, 85, 85, 73, 68, 59, 38, 35, 30, 29, 23, 12, 11, 11] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41469764709472656 seconds Jaccard graph constructed in 0.6524677276611328 seconds Wrote graph to binary file in 0.06051373481750488 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914361 Louvain completed 21 runs in 1.6785728931427002 seconds PhenoGraph complete in 2.823359727859497 seconds Found communities [-1, ... 22], with sizes: [246, 445, 337, 288, 277, 226, 224, 217, 173, 140, 117, 97, 95, 66, 56, 41, 28, 26, 26, 22, 13, 11, 11, 11] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31067657470703125 seconds Jaccard graph constructed in 0.7079861164093018 seconds Wrote graph to binary file in 0.060437917709350586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911151 After 3 runs, maximum modularity is Q = 0.912428 Louvain completed 23 runs in 2.0763797760009766 seconds PhenoGraph complete in 3.1704320907592773 seconds Found communities [-1, ... 21], with sizes: [244, 346, 346, 266, 263, 234, 231, 227, 219, 167, 136, 110, 87, 66, 55, 34, 31, 30, 28, 27, 21, 14, 11] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41648435592651367 seconds Jaccard graph constructed in 0.6557509899139404 seconds Wrote graph to binary file in 0.05987143516540527 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911838 After 2 runs, maximum modularity is Q = 0.913043 Louvain completed 22 runs in 2.020399332046509 seconds PhenoGraph complete in 3.1707959175109863 seconds Found communities [-1, ... 21], with sizes: [237, 463, 305, 300, 246, 219, 206, 203, 186, 171, 158, 92, 86, 69, 64, 39, 28, 27, 24, 23, 22, 13, 12] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3099398612976074 seconds Jaccard graph constructed in 0.638746976852417 seconds Wrote graph to binary file in 0.3167448043823242 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911895 After 9 runs, maximum modularity is Q = 0.913302 Louvain completed 29 runs in 2.5141894817352295 seconds PhenoGraph complete in 3.793105363845825 seconds Found communities [-1, ... 20], with sizes: [253, 365, 346, 317, 262, 245, 217, 206, 201, 130, 120, 109, 107, 65, 64, 49, 35, 28, 27, 24, 12, 11] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31298279762268066 seconds Jaccard graph constructed in 0.6400563716888428 seconds Wrote graph to binary file in 0.06019878387451172 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.915879 Louvain completed 21 runs in 1.6584103107452393 seconds PhenoGraph complete in 2.6892991065979004 seconds Found communities [-1, ... 21], with sizes: [249, 435, 412, 312, 241, 216, 210, 210, 137, 117, 111, 99, 90, 82, 64, 46, 45, 30, 26, 22, 14, 13, 12] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41120338439941406 seconds Jaccard graph constructed in 0.6665787696838379 seconds Wrote graph to binary file in 0.05998063087463379 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913767 Louvain completed 21 runs in 1.6954376697540283 seconds PhenoGraph complete in 2.8483452796936035 seconds Found communities [-1, ... 22], with sizes: [248, 571, 329, 302, 267, 251, 220, 177, 152, 114, 98, 86, 67, 66, 52, 36, 28, 27, 22, 21, 16, 15, 15, 13] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3105454444885254 seconds Jaccard graph constructed in 0.6610550880432129 seconds Wrote graph to binary file in 0.0586395263671875 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910875 After 2 runs, maximum modularity is Q = 0.912356 After 11 runs, maximum modularity is Q = 0.913604 Louvain completed 31 runs in 2.885418176651001 seconds PhenoGraph complete in 3.932236909866333 seconds Found communities [-1, ... 21], with sizes: [258, 454, 359, 271, 265, 238, 212, 200, 160, 150, 142, 100, 78, 64, 64, 37, 28, 27, 23, 20, 18, 13, 12] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41141247749328613 seconds Jaccard graph constructed in 0.7076559066772461 seconds Wrote graph to binary file in 0.35921216011047363 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911854 Louvain completed 21 runs in 1.6987216472625732 seconds PhenoGraph complete in 3.1918833255767822 seconds Found communities [-1, ... 20], with sizes: [197, 594, 309, 262, 251, 216, 209, 192, 166, 159, 143, 98, 84, 82, 62, 39, 29, 29, 25, 22, 14, 11] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31340742111206055 seconds Jaccard graph constructed in 0.6480762958526611 seconds Wrote graph to binary file in 0.05925345420837402 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910218 After 3 runs, maximum modularity is Q = 0.911512 Louvain completed 23 runs in 2.070939064025879 seconds PhenoGraph complete in 3.105262279510498 seconds Found communities [-1, ... 20], with sizes: [262, 493, 384, 375, 241, 221, 211, 203, 156, 133, 89, 78, 54, 48, 47, 40, 39, 28, 27, 26, 25, 13] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.318835973739624 seconds Jaccard graph constructed in 0.6472687721252441 seconds Wrote graph to binary file in 0.06056022644042969 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914123 Louvain completed 21 runs in 1.7077522277832031 seconds PhenoGraph complete in 2.7500057220458984 seconds Found communities [-1, ... 19], with sizes: [257, 502, 328, 313, 261, 226, 219, 214, 158, 131, 114, 90, 76, 71, 64, 43, 31, 29, 27, 23, 16] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.310788631439209 seconds Jaccard graph constructed in 0.6427536010742188 seconds Wrote graph to binary file in 0.05820202827453613 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911745 After 2 runs, maximum modularity is Q = 0.912848 Louvain completed 22 runs in 2.0115232467651367 seconds PhenoGraph complete in 3.0371336936950684 seconds Found communities [-1, ... 19], with sizes: [240, 574, 327, 271, 268, 254, 242, 218, 159, 154, 104, 90, 54, 49, 39, 37, 28, 28, 28, 16, 13] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30944156646728516 seconds Jaccard graph constructed in 0.6575002670288086 seconds Wrote graph to binary file in 0.3323483467102051 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910348 After 13 runs, maximum modularity is Q = 0.91146 Louvain completed 33 runs in 2.7110776901245117 seconds PhenoGraph complete in 4.027470111846924 seconds Found communities [-1, ... 18], with sizes: [253, 592, 283, 268, 245, 238, 213, 193, 161, 147, 145, 114, 77, 67, 67, 36, 28, 25, 22, 19] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3081531524658203 seconds Jaccard graph constructed in 0.6655983924865723 seconds Wrote graph to binary file in 0.05713820457458496 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911014 After 6 runs, maximum modularity is Q = 0.912216 Louvain completed 26 runs in 2.2825162410736084 seconds PhenoGraph complete in 3.334857702255249 seconds Found communities [-1, ... 21], with sizes: [231, 465, 308, 279, 220, 219, 211, 210, 146, 121, 120, 103, 98, 89, 79, 77, 65, 38, 29, 27, 26, 21, 11] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41150379180908203 seconds Jaccard graph constructed in 0.6342306137084961 seconds Wrote graph to binary file in 0.0579066276550293 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914207 Louvain completed 21 runs in 1.6732633113861084 seconds PhenoGraph complete in 2.79417085647583 seconds Found communities [-1, ... 21], with sizes: [246, 567, 318, 277, 276, 246, 241, 215, 150, 119, 93, 74, 71, 59, 47, 40, 31, 29, 23, 22, 21, 14, 14] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3146936893463135 seconds Jaccard graph constructed in 0.6446681022644043 seconds Wrote graph to binary file in 0.30008506774902344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913863 After 8 runs, maximum modularity is Q = 0.914904 Louvain completed 28 runs in 2.406613349914551 seconds PhenoGraph complete in 3.6837995052337646 seconds Found communities [-1, ... 22], with sizes: [241, 461, 329, 325, 307, 247, 223, 170, 159, 124, 88, 82, 71, 65, 64, 50, 35, 29, 27, 26, 26, 20, 13, 11] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4151759147644043 seconds Jaccard graph constructed in 0.6969523429870605 seconds Wrote graph to binary file in 0.05820322036743164 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912801 After 5 runs, maximum modularity is Q = 0.913839 Louvain completed 25 runs in 2.2126829624176025 seconds PhenoGraph complete in 3.3955702781677246 seconds Found communities [-1, ... 19], with sizes: [270, 537, 301, 278, 242, 227, 219, 209, 191, 164, 140, 82, 77, 51, 39, 34, 33, 29, 29, 28, 13] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3191845417022705 seconds Jaccard graph constructed in 0.6557085514068604 seconds Wrote graph to binary file in 0.058357954025268555 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912105 Louvain completed 21 runs in 1.6967473030090332 seconds PhenoGraph complete in 2.7497496604919434 seconds Found communities [-1, ... 19], with sizes: [200, 432, 336, 331, 289, 238, 221, 212, 169, 154, 152, 96, 72, 71, 44, 44, 37, 30, 29, 24, 12] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.41039347648620605 seconds Jaccard graph constructed in 0.6711676120758057 seconds Wrote graph to binary file in 0.058712005615234375 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910091 After 4 runs, maximum modularity is Q = 0.911357 Louvain completed 24 runs in 2.1444499492645264 seconds PhenoGraph complete in 3.300370454788208 seconds Found communities [-1, ... 21], with sizes: [247, 446, 298, 247, 236, 231, 216, 189, 174, 126, 119, 110, 100, 93, 74, 66, 59, 39, 33, 29, 27, 23, 11] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3092787265777588 seconds Jaccard graph constructed in 0.6285579204559326 seconds Wrote graph to binary file in 0.33060455322265625 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911165 After 2 runs, maximum modularity is Q = 0.912317 Louvain completed 22 runs in 2.004624605178833 seconds PhenoGraph complete in 3.286855459213257 seconds Found communities [-1, ... 19], with sizes: [277, 458, 349, 311, 286, 265, 226, 202, 172, 143, 103, 87, 72, 49, 43, 37, 27, 26, 26, 21, 13]
# Post-doublet-inference processing for sample D354_Brus_Dis1.
# Scale every cell to a total of 10,000 counts.
sc.pp.normalize_per_cell(D354_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
# log(1 + x) transform of the normalized values.
sc.pp.log1p(D354_Brus_Dis1) # log transform the data
# Snapshot the object in .raw. NOTE: this is taken AFTER normalization and log
# transform, so .raw holds normalized/log values for the full gene set, not the
# original UMI counts.
D354_Brus_Dis1.raw = D354_Brus_Dis1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged in var['ribo_genes']. Despite its name, that flag
# was set during QC to True for NON-ribosomal genes, so this drops the
# ribosomal genes. The result is a view of the full object.
D354_Brus_Dis1 = D354_Brus_Dis1[:, D354_Brus_Dis1.var['ribo_genes']]
# Bare expression: shows the object summary as the notebook cell output.
D354_Brus_Dis1
View of AnnData object with n_obs × n_vars = 2555 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Read the CellRanger filtered matrix of sample D363_Brus_Dis1, indexing genes
# by symbol. cache=True lets scanpy reuse its cached .h5ad copy on later runs.
D363_Brus_Dis1 = sc.read_10x_mtx(
    './D363_Brus_Dis1/' + outsPath, var_names='gene_symbols', cache=True
)
# Gene symbols can repeat; make the variable names unique.
D363_Brus_Dis1.var_names_make_unique()

# Sample-level annotations, identical for every cell of this dataset.
D363_Brus_Dis1.obs['manip'] = 'D363_Brus_Dis1'
D363_Brus_Dis1.obs['position'] = 'Distal'
D363_Brus_Dis1.obs['method'] = 'Brushing'
D363_Brus_Dis1.obs['donor'] = 'D363'

# Prefix every barcode with the sample name and use the result as the cell
# index (presumably so that cell names stay unique once datasets are aggregated).
D363_Brus_Dis1.obs['name'] = [
    f'D363_Brus_Dis1_{barcode}' for barcode in D363_Brus_Dis1.obs_names
]
D363_Brus_Dis1.obs_names = D363_Brus_Dis1.obs['name']

# Bare expression: shows the object summary as the notebook cell output.
D363_Brus_Dis1
... reading from cache file ./cache/D363_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1636 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Quality-control metrics for sample D363_Brus_Dis1 (computed before filtering).

# Plot the 20 most highly expressed genes.
sc.pl.highest_expr_genes(D363_Brus_Dis1, n_top=20)

# min_genes=0 cannot remove any cell; the call is only there to add the
# per-cell number of detected genes as obs['n_genes'].
sc.pp.filter_cells(D363_Brus_Dis1, min_genes=0)

# Total counts per cell, computed once and reused for both fractions below.
total_counts = D363_Brus_Dis1.X.sum(axis=1).A1

# Fraction (0-1, despite the 'percent' name) of each cell's counts that comes
# from mitochondrial genes, identified by the 'MT-' symbol prefix.
mito_genes = D363_Brus_Dis1.var_names.str.startswith('MT-')
D363_Brus_Dis1.obs['percent_mito'] = (
    D363_Brus_Dis1[:, mito_genes].X.sum(axis=1).A1 / total_counts
)
D363_Brus_Dis1.obs['n_counts'] = total_counts

# remove_RB_genes() supplies the list of ribosomal genes present in this
# dataset; its two other return values are not used in this cell.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(
    D363_Brus_Dis1.to_df()
)
# Boolean mask over genes. It is built from var_names directly: to_df() uses
# var_names as its columns, so a second dense conversion is unnecessary.
ribo_genes = D363_Brus_Dis1.var_names.isin(RB_genes_in_df)
D363_Brus_Dis1.obs['percent_ribo'] = (
    D363_Brus_Dis1[:, ribo_genes].X.sum(axis=1).A1 / total_counts
)
# NOTE: despite its name, var['ribo_genes'] is True for NON-ribosomal genes.
# It is used further down as the mask of genes to keep.
D363_Brus_Dis1.var['ribo_genes'] = ~ribo_genes

# Per-cell distributions of the four QC metrics, one panel per metric.
sc.pl.violin(
    D363_Brus_Dis1,
    ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
    jitter=0.4,
    multi_panel=True,
)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Cell-level QC filtering for D363_Brus_Dis1.
# In-place removal of cells with fewer than 500 detected genes.
sc.pp.filter_cells(D363_Brus_Dis1, min_genes=500)
# Upper bound on total counts per cell.
under_count_cap = D363_Brus_Dis1.obs['n_counts'] < 40000
D363_Brus_Dis1 = D363_Brus_Dis1[under_count_cap, :]
# percent_mito is stored as a fraction, so 0.5 means 50% mitochondrial counts.
under_mito_cap = D363_Brus_Dis1.obs['percent_mito'] < 0.5
D363_Brus_Dis1 = D363_Brus_Dis1[under_mito_cap, :]
filtered out 7 cells that have less than 500 genes expressed
# scrublet
# Run Scrublet on the QC-filtered matrix (normalization only happens further
# down). expected_doublet_rate=0.014 is the prior doublet fraction supplied for
# this sample; the run log reports it as "Expected = 1.4%".
scrub = scr.Scrublet(D363_Brus_Dis1.X, expected_doublet_rate=0.014)
# scrub_doublets() returns two values, which are stored as per-cell .obs columns.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D363_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D363_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
# Plot the score histograms; as the last expression of the cell, the returned
# (figure, axes) tuple is echoed in the output.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.15 Detected doublet rate = 0.3% Estimated detectable doublet fraction = 12.2% Overall doublet rate: Expected = 1.4% Estimated = 2.5% Elapsed time: 1.2 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9f31f7f0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9e769b00>],
dtype=object))
# doubletDetection
# Second doublet call on the same QC-filtered, not-yet-normalized matrix:
# fit a BoostClassifier with default parameters and store its labels in .obs.
clf = doubletdetection.BoostClassifier()
# The package densifies sparse input (see the UserWarning in the cell output).
labels = clf.fit(D363_Brus_Dis1.X).predict()
D363_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3056020736694336 seconds Jaccard graph constructed in 0.5146067142486572 seconds Wrote graph to binary file in 0.03743934631347656 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888374 Louvain completed 21 runs in 1.5843629837036133 seconds PhenoGraph complete in 2.453190565109253 seconds Found communities [-1, ... 16], with sizes: [114, 448, 325, 285, 195, 116, 69, 65, 56, 56, 56, 47, 45, 42, 38, 34, 25, 15] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3059718608856201 seconds Jaccard graph constructed in 0.5110785961151123 seconds Wrote graph to binary file in 0.03577017784118652 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.88442 After 2 runs, maximum modularity is Q = 0.887818 Louvain completed 22 runs in 1.8797099590301514 seconds PhenoGraph complete in 2.7443838119506836 seconds Found communities [-1, ... 15], with sizes: [116, 427, 314, 293, 250, 121, 73, 62, 54, 53, 47, 45, 43, 38, 37, 33, 25] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30555295944213867 seconds Jaccard graph constructed in 0.5366594791412354 seconds Wrote graph to binary file in 0.03516554832458496 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.880115 Louvain completed 21 runs in 1.5799152851104736 seconds PhenoGraph complete in 2.471057176589966 seconds Found communities [-1, ... 16], with sizes: [111, 390, 354, 345, 107, 105, 104, 73, 61, 59, 53, 50, 47, 44, 38, 33, 29, 28] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30599188804626465 seconds Jaccard graph constructed in 0.5148305892944336 seconds Wrote graph to binary file in 0.03429746627807617 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.88795 Louvain completed 21 runs in 1.5485806465148926 seconds PhenoGraph complete in 2.4131393432617188 seconds Found communities [-1, ... 16], with sizes: [100, 445, 328, 277, 201, 105, 72, 64, 64, 58, 49, 49, 46, 43, 40, 33, 30, 27] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30594825744628906 seconds Jaccard graph constructed in 0.505589485168457 seconds Wrote graph to binary file in 0.29527902603149414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888183 Louvain completed 21 runs in 1.5424954891204834 seconds PhenoGraph complete in 2.659623384475708 seconds Found communities [-1, ... 
16], with sizes: [90, 472, 330, 247, 186, 123, 77, 62, 60, 59, 58, 54, 53, 46, 36, 34, 25, 19] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20841121673583984 seconds Jaccard graph constructed in 0.5117864608764648 seconds Wrote graph to binary file in 0.08066749572753906 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886045 After 3 runs, maximum modularity is Q = 0.887464 Louvain completed 23 runs in 1.927800178527832 seconds PhenoGraph complete in 2.7448880672454834 seconds Found communities [-1, ... 17], with sizes: [92, 370, 297, 281, 184, 106, 98, 90, 66, 61, 61, 60, 56, 51, 44, 39, 34, 27, 14] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.307281494140625 seconds Jaccard graph constructed in 0.5165157318115234 seconds Wrote graph to binary file in 0.05749392509460449 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.891585 After 13 runs, maximum modularity is Q = 0.892615 Louvain completed 33 runs in 2.569979667663574 seconds PhenoGraph complete in 3.4644887447357178 seconds Found communities [-1, ... 16], with sizes: [106, 414, 360, 308, 165, 106, 68, 62, 60, 60, 59, 53, 44, 41, 34, 33, 32, 26] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2069225311279297 seconds Jaccard graph constructed in 0.5162346363067627 seconds Wrote graph to binary file in 0.09787106513977051 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888624 Louvain completed 21 runs in 1.5862059593200684 seconds PhenoGraph complete in 2.4214892387390137 seconds Found communities [-1, ... 14], with sizes: [131, 398, 328, 328, 166, 122, 104, 88, 69, 67, 48, 46, 43, 34, 33, 26] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30587148666381836 seconds Jaccard graph constructed in 0.5218026638031006 seconds Wrote graph to binary file in 0.06596827507019043 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888201 After 2 runs, maximum modularity is Q = 0.890184 Louvain completed 22 runs in 1.8673889636993408 seconds PhenoGraph complete in 2.777205228805542 seconds Found communities [-1, ... 16], with sizes: [112, 391, 377, 281, 197, 108, 78, 67, 63, 51, 50, 48, 47, 42, 34, 30, 30, 25] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20612597465515137 seconds Jaccard graph constructed in 0.5316848754882812 seconds Wrote graph to binary file in 0.056473493576049805 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.884153 After 3 runs, maximum modularity is Q = 0.885227 Louvain completed 23 runs in 1.9474444389343262 seconds PhenoGraph complete in 2.774909496307373 seconds Found communities [-1, ... 17], with sizes: [105, 385, 316, 226, 198, 115, 109, 82, 62, 60, 58, 56, 55, 47, 44, 34, 32, 28, 19] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31055498123168945 seconds Jaccard graph constructed in 0.5277385711669922 seconds Wrote graph to binary file in 0.3530879020690918 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888808 Louvain completed 21 runs in 1.559537649154663 seconds PhenoGraph complete in 2.7639319896698 seconds Found communities [-1, ... 17], with sizes: [95, 403, 337, 285, 219, 116, 97, 64, 64, 63, 49, 47, 45, 37, 35, 27, 19, 15, 14] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3062112331390381 seconds Jaccard graph constructed in 0.500612735748291 seconds Wrote graph to binary file in 0.0522458553314209 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.887552 After 3 runs, maximum modularity is Q = 0.889903 Louvain completed 23 runs in 1.9731624126434326 seconds PhenoGraph complete in 2.8495771884918213 seconds Found communities [-1, ... 16], with sizes: [118, 368, 341, 293, 193, 124, 107, 75, 72, 59, 49, 47, 41, 37, 36, 34, 26, 11] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30605316162109375 seconds Jaccard graph constructed in 0.5170273780822754 seconds Wrote graph to binary file in 0.06779837608337402 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.88737 After 3 runs, maximum modularity is Q = 0.889903 Louvain completed 23 runs in 1.9486722946166992 seconds PhenoGraph complete in 2.8560667037963867 seconds Found communities [-1, ... 16], with sizes: [106, 397, 308, 299, 221, 123, 71, 67, 67, 62, 50, 49, 48, 44, 40, 34, 26, 19] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20672941207885742 seconds Jaccard graph constructed in 0.5143043994903564 seconds Wrote graph to binary file in 0.07566356658935547 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886199 After 15 runs, maximum modularity is Q = 0.887373 Louvain completed 35 runs in 2.6414635181427 seconds PhenoGraph complete in 3.4506618976593018 seconds Found communities [-1, ... 16], with sizes: [116, 469, 308, 294, 153, 79, 72, 68, 68, 61, 53, 49, 44, 44, 44, 44, 35, 30] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20579004287719727 seconds Jaccard graph constructed in 0.5308966636657715 seconds Wrote graph to binary file in 0.0879676342010498 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.895374 Louvain completed 21 runs in 1.5688138008117676 seconds PhenoGraph complete in 2.4071056842803955 seconds Found communities [-1, ... 16], with sizes: [116, 373, 358, 313, 188, 104, 72, 72, 59, 58, 55, 54, 45, 42, 36, 34, 30, 22] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30623459815979004 seconds Jaccard graph constructed in 0.5156998634338379 seconds Wrote graph to binary file in 0.04895901679992676 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.884455 Louvain completed 21 runs in 1.5934596061706543 seconds PhenoGraph complete in 2.477506399154663 seconds Found communities [-1, ... 
16], with sizes: [104, 356, 337, 277, 198, 137, 108, 102, 66, 65, 45, 45, 45, 45, 35, 32, 23, 11] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20683526992797852 seconds Jaccard graph constructed in 0.5088133811950684 seconds Wrote graph to binary file in 0.3363635540008545 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.884463 After 2 runs, maximum modularity is Q = 0.888948 Louvain completed 22 runs in 1.9026503562927246 seconds PhenoGraph complete in 2.968736171722412 seconds Found communities [-1, ... 16], with sizes: [120, 396, 342, 299, 195, 105, 73, 66, 60, 56, 54, 46, 45, 42, 38, 36, 30, 28] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3061048984527588 seconds Jaccard graph constructed in 0.5150551795959473 seconds Wrote graph to binary file in 0.07030296325683594 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.890766 After 6 runs, maximum modularity is Q = 0.891794 Louvain completed 26 runs in 2.142488956451416 seconds PhenoGraph complete in 3.0472187995910645 seconds Found communities [-1, ... 16], with sizes: [113, 444, 339, 242, 212, 109, 80, 70, 66, 56, 51, 47, 43, 39, 37, 35, 26, 22] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3075876235961914 seconds Jaccard graph constructed in 0.5247697830200195 seconds Wrote graph to binary file in 0.07669305801391602 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.886795 After 2 runs, maximum modularity is Q = 0.89007 Louvain completed 22 runs in 1.8691158294677734 seconds PhenoGraph complete in 2.7894909381866455 seconds Found communities [-1, ... 18], with sizes: [103, 310, 301, 202, 192, 124, 117, 105, 80, 69, 67, 59, 56, 45, 43, 42, 38, 36, 25, 17] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.31122303009033203 seconds Jaccard graph constructed in 0.5190975666046143 seconds Wrote graph to binary file in 0.09315967559814453 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885109 After 2 runs, maximum modularity is Q = 0.886114 Louvain completed 22 runs in 1.8706414699554443 seconds PhenoGraph complete in 2.816376209259033 seconds Found communities [-1, ... 16], with sizes: [98, 419, 365, 292, 164, 101, 70, 68, 65, 62, 61, 44, 43, 41, 39, 37, 32, 30] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3073849678039551 seconds Jaccard graph constructed in 0.5167427062988281 seconds Wrote graph to binary file in 0.0665273666381836 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.893247 After 17 runs, maximum modularity is Q = 0.894652 Louvain completed 37 runs in 2.7665717601776123 seconds PhenoGraph complete in 3.6692793369293213 seconds Found communities [-1, ... 17], with sizes: [106, 375, 373, 292, 169, 103, 79, 74, 66, 63, 52, 50, 45, 44, 35, 35, 29, 28, 13] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30718994140625 seconds Jaccard graph constructed in 0.5227699279785156 seconds Wrote graph to binary file in 0.06388568878173828 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.887266 Louvain completed 21 runs in 1.5775952339172363 seconds PhenoGraph complete in 2.4874119758605957 seconds Found communities [-1, ... 15], with sizes: [97, 356, 351, 332, 187, 123, 109, 80, 69, 69, 49, 45, 44, 34, 30, 29, 27] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20546865463256836 seconds Jaccard graph constructed in 0.5173823833465576 seconds Wrote graph to binary file in 0.36354827880859375 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.885597 After 2 runs, maximum modularity is Q = 0.888471 Louvain completed 22 runs in 1.8766109943389893 seconds PhenoGraph complete in 2.982496738433838 seconds Found communities [-1, ... 16], with sizes: [107, 415, 295, 289, 192, 124, 114, 64, 63, 59, 51, 48, 45, 43, 36, 30, 30, 26] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20569634437561035 seconds Jaccard graph constructed in 0.5270733833312988 seconds Wrote graph to binary file in 0.05930829048156738 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.887685 Louvain completed 21 runs in 1.551440715789795 seconds PhenoGraph complete in 2.3558590412139893 seconds Found communities [-1, ... 15], with sizes: [120, 405, 292, 197, 187, 149, 122, 99, 78, 63, 59, 58, 53, 45, 42, 36, 26] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3061830997467041 seconds Jaccard graph constructed in 0.5073649883270264 seconds Wrote graph to binary file in 0.09500670433044434 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.888464 Louvain completed 21 runs in 1.556274652481079 seconds PhenoGraph complete in 2.4863333702087402 seconds Found communities [-1, ... 
16], with sizes: [90, 393, 337, 297, 192, 114, 88, 70, 65, 57, 55, 53, 52, 46, 45, 35, 26, 16]
# Post-inference processing for D363_Brus_Dis1: normalize, log-transform,
# store the current state in .raw, then drop the ribosomal (RB) genes.
sc.pp.normalize_per_cell(D363_Brus_Dis1, counts_per_cell_after=1e4) # scale every cell to 10,000 total counts
sc.pp.log1p(D363_Brus_Dis1) # log(1 + x) transform of the normalized values
# NOTE: .raw is assigned after normalization and log1p and before the gene
# subsetting below, so it holds log-normalized values for all genes,
# not the original UMI counts.
D363_Brus_Dis1.raw = D363_Brus_Dis1 # freeze the object (for later use of this state)
# var['ribo_genes'] was set to the negation of the RB-gene mask, i.e. it is
# True for genes NOT in the RB list, so this keeps the non-ribosomal genes.
D363_Brus_Dis1 = D363_Brus_Dis1[:, D363_Brus_Dis1.var['ribo_genes']]
D363_Brus_Dis1
View of AnnData object with n_obs × n_vars = 1625 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Load the 10x matrix of sample D367_Brus_Dis1 (read through scanpy's cache)
# and index genes by symbol, de-duplicating repeated symbols.
D367_Brus_Dis1 = sc.read_10x_mtx(
    f'./D367_Brus_Dis1/{outsPath}',
    cache=True,
    var_names='gene_symbols',
)
D367_Brus_Dis1.var_names_make_unique()

# Sample-level annotations, written in the same column order as before.
for column, value in (
    ('manip', 'D367_Brus_Dis1'),
    ('position', 'Distal'),
    ('method', 'Brushing'),
    ('donor', 'D367'),
):
    D367_Brus_Dis1.obs[column] = value

# Prefix every barcode with the sample name and use the result as the cell index.
D367_Brus_Dis1.obs['name'] = [
    f'D367_Brus_Dis1_{barcode}' for barcode in D367_Brus_Dis1.obs_names
]
D367_Brus_Dis1.obs_names = D367_Brus_Dis1.obs['name']
D367_Brus_Dis1
... reading from cache file ./cache/D367_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 2192 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# QC metrics for D367_Brus_Dis1: top expressed genes, per-cell gene/count
# totals, mitochondrial and ribosomal fractions, then violin plots of all four.
sc.pl.highest_expr_genes(D367_Brus_Dis1, n_top=20)
# With min_genes=0 this call is used for its annotation side effect (the
# 'n_genes' column plotted below) rather than to drop cells.
sc.pp.filter_cells(D367_Brus_Dis1, min_genes=0)

# Per-cell total counts: computed once and reused as the denominator of both
# fractions and as 'n_counts', instead of re-summing the matrix three times.
total_counts = D367_Brus_Dis1.X.sum(axis=1).A1

# Mitochondrial genes are identified by the 'MT-' gene-symbol prefix.
mito_genes = D367_Brus_Dis1.var_names.str.startswith('MT-')
D367_Brus_Dis1.obs['percent_mito'] = (
    D367_Brus_Dis1[:, mito_genes].X.sum(axis=1).A1 / total_counts)
D367_Brus_Dis1.obs['n_counts'] = total_counts

# remove_RB_genes is needed here only for the list of RB genes present in this
# dataset; the other two return values keep their original names.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D367_Brus_Dis1.to_df())
# Membership is tested on var_names directly (they are the columns of
# to_df()), which avoids building a second dense copy of the whole matrix.
ribo_genes = D367_Brus_Dis1.var_names.isin(RB_genes_in_df)
D367_Brus_Dis1.obs['percent_ribo'] = (
    D367_Brus_Dis1[:, ribo_genes].X.sum(axis=1).A1 / total_counts)
# NOTE: despite its name, var['ribo_genes'] is True for genes that are NOT in
# the RB list, so it can be used directly as a "genes to keep" mask.
D367_Brus_Dis1.var['ribo_genes'] = ~ribo_genes
sc.pl.violin(D367_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Cell-level QC filtering for D367_Brus_Dis1.
# In-place removal of cells with fewer than 500 detected genes.
sc.pp.filter_cells(D367_Brus_Dis1, min_genes=500)
# Upper bound on total counts per cell.
under_count_cap = D367_Brus_Dis1.obs['n_counts'] < 25000
D367_Brus_Dis1 = D367_Brus_Dis1[under_count_cap, :]
# percent_mito is stored as a fraction, so 0.5 means 50% mitochondrial counts.
under_mito_cap = D367_Brus_Dis1.obs['percent_mito'] < 0.5
D367_Brus_Dis1 = D367_Brus_Dis1[under_mito_cap, :]
filtered out 63 cells that have less than 500 genes expressed
# scrublet
# Run Scrublet on the QC-filtered matrix (normalization only happens further
# down). expected_doublet_rate=0.018 is the prior doublet fraction supplied for
# this sample; the run log reports it as "Expected = 1.8%".
scrub = scr.Scrublet(D367_Brus_Dis1.X, expected_doublet_rate=0.018)
# scrub_doublets() returns two values, which are stored as per-cell .obs columns.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D367_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D367_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
# Plot the score histograms; as the last expression of the cell, the returned
# (figure, axes) tuple is echoed in the output.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.20 Detected doublet rate = 0.5% Estimated detectable doublet fraction = 34.2% Overall doublet rate: Expected = 1.8% Estimated = 1.4% Elapsed time: 1.6 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea0b48e48>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9e966080>],
dtype=object))
# doubletDetection
# Second doublet call on the same QC-filtered, not-yet-normalized matrix:
# fit a BoostClassifier with default parameters and store its labels in .obs.
clf = doubletdetection.BoostClassifier()
# The package densifies sparse input (see the UserWarning in the cell output).
labels = clf.fit(D367_Brus_Dis1.X).predict()
D367_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30817437171936035 seconds Jaccard graph constructed in 0.5569641590118408 seconds Wrote graph to binary file in 0.05292320251464844 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.915315 Louvain completed 21 runs in 1.6288034915924072 seconds PhenoGraph complete in 2.5775041580200195 seconds Found communities [-1, ... 22], with sizes: [210, 389, 372, 142, 138, 129, 123, 121, 106, 96, 94, 90, 88, 86, 85, 78, 78, 62, 61, 30, 24, 23, 14, 11] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30838942527770996 seconds Jaccard graph constructed in 0.6199069023132324 seconds Wrote graph to binary file in 0.3374314308166504 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913035 Louvain completed 21 runs in 1.648174524307251 seconds PhenoGraph complete in 2.926996946334839 seconds Found communities [-1, ... 23], with sizes: [213, 374, 352, 188, 139, 132, 124, 123, 122, 117, 112, 91, 85, 82, 69, 60, 54, 51, 34, 33, 27, 27, 16, 13, 12] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30765342712402344 seconds Jaccard graph constructed in 0.6194920539855957 seconds Wrote graph to binary file in 0.04731869697570801 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913135 After 8 runs, maximum modularity is Q = 0.914167 Louvain completed 28 runs in 2.3672173023223877 seconds PhenoGraph complete in 3.3543691635131836 seconds Found communities [-1, ... 22], with sizes: [246, 368, 335, 169, 162, 138, 123, 123, 121, 119, 119, 96, 80, 75, 69, 64, 46, 44, 41, 29, 29, 22, 20, 12] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30759096145629883 seconds Jaccard graph constructed in 0.6535928249359131 seconds Wrote graph to binary file in 0.0500178337097168 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910542 Louvain completed 21 runs in 1.6546294689178467 seconds PhenoGraph complete in 2.683774709701538 seconds Found communities [-1, ... 22], with sizes: [224, 438, 345, 222, 150, 135, 126, 124, 122, 94, 92, 82, 74, 61, 58, 53, 53, 50, 45, 35, 21, 18, 16, 12] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30747389793395996 seconds Jaccard graph constructed in 0.632627010345459 seconds Wrote graph to binary file in 0.04820871353149414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909638 After 4 runs, maximum modularity is Q = 0.910691 Louvain completed 24 runs in 2.1220600605010986 seconds PhenoGraph complete in 3.1250510215759277 seconds Found communities [-1, ... 22], with sizes: [206, 343, 330, 169, 154, 138, 123, 120, 119, 119, 117, 91, 84, 77, 77, 65, 64, 59, 57, 47, 29, 27, 22, 13] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3113377094268799 seconds Jaccard graph constructed in 0.6209285259246826 seconds Wrote graph to binary file in 0.3390324115753174 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911996 Louvain completed 21 runs in 1.6507577896118164 seconds PhenoGraph complete in 2.938380002975464 seconds Found communities [-1, ... 21], with sizes: [203, 407, 338, 154, 141, 133, 121, 121, 116, 106, 103, 92, 87, 86, 84, 84, 75, 58, 48, 32, 27, 23, 11] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3106231689453125 seconds Jaccard graph constructed in 0.6192572116851807 seconds Wrote graph to binary file in 0.0482940673828125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.914419 After 3 runs, maximum modularity is Q = 0.91564 Louvain completed 23 runs in 2.0288586616516113 seconds PhenoGraph complete in 3.0233969688415527 seconds Found communities [-1, ... 22], with sizes: [229, 399, 383, 145, 139, 123, 121, 118, 99, 97, 94, 90, 82, 81, 79, 77, 73, 69, 50, 32, 24, 19, 16, 11] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30785131454467773 seconds Jaccard graph constructed in 0.6327290534973145 seconds Wrote graph to binary file in 0.047777652740478516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911586 Louvain completed 21 runs in 1.631681203842163 seconds PhenoGraph complete in 2.632725954055786 seconds Found communities [-1, ... 20], with sizes: [220, 432, 326, 140, 134, 131, 130, 128, 126, 125, 125, 111, 105, 77, 70, 68, 67, 43, 28, 27, 24, 13] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30881333351135254 seconds Jaccard graph constructed in 0.6662120819091797 seconds Wrote graph to binary file in 0.04795646667480469 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.916893 Louvain completed 21 runs in 1.6270017623901367 seconds PhenoGraph complete in 2.667825937271118 seconds Found communities [-1, ... 21], with sizes: [229, 394, 347, 217, 170, 132, 130, 120, 116, 104, 93, 85, 74, 71, 68, 66, 61, 51, 37, 36, 23, 15, 11] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30783963203430176 seconds Jaccard graph constructed in 0.6570472717285156 seconds Wrote graph to binary file in 0.047704219818115234 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.913946 Louvain completed 21 runs in 1.6545851230621338 seconds PhenoGraph complete in 2.6834912300109863 seconds Found communities [-1, ... 22], with sizes: [202, 387, 345, 146, 133, 131, 127, 119, 119, 103, 91, 90, 90, 83, 83, 81, 75, 65, 59, 31, 27, 25, 22, 16] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30813074111938477 seconds Jaccard graph constructed in 0.6231639385223389 seconds Wrote graph to binary file in 0.3321681022644043 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911367 After 3 runs, maximum modularity is Q = 0.912621 Louvain completed 23 runs in 2.0352542400360107 seconds PhenoGraph complete in 3.3125600814819336 seconds Found communities [-1, ... 20], with sizes: [218, 412, 390, 192, 136, 128, 125, 124, 120, 105, 91, 90, 85, 85, 80, 68, 63, 43, 32, 26, 22, 15] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30970287322998047 seconds Jaccard graph constructed in 0.6314880847930908 seconds Wrote graph to binary file in 0.04741311073303223 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909782 Louvain completed 21 runs in 1.6635627746582031 seconds PhenoGraph complete in 2.6692705154418945 seconds Found communities [-1, ... 21], with sizes: [197, 433, 267, 236, 138, 136, 121, 121, 121, 99, 97, 95, 94, 78, 76, 71, 70, 56, 52, 32, 27, 22, 11] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30899667739868164 seconds Jaccard graph constructed in 0.6445717811584473 seconds Wrote graph to binary file in 0.046227216720581055 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.907196 Louvain completed 21 runs in 1.653536319732666 seconds PhenoGraph complete in 2.6658196449279785 seconds Found communities [-1, ... 20], with sizes: [216, 374, 358, 179, 170, 137, 132, 128, 127, 117, 115, 112, 85, 80, 68, 63, 50, 36, 35, 25, 24, 19] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2072758674621582 seconds Jaccard graph constructed in 0.6239023208618164 seconds Wrote graph to binary file in 0.04873156547546387 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911307 After 5 runs, maximum modularity is Q = 0.912324 Louvain completed 25 runs in 2.1757593154907227 seconds PhenoGraph complete in 3.068892240524292 seconds Found communities [-1, ... 22], with sizes: [248, 393, 372, 164, 139, 128, 126, 115, 114, 101, 98, 94, 77, 76, 66, 65, 61, 48, 41, 33, 30, 26, 23, 12] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3086273670196533 seconds Jaccard graph constructed in 0.6524572372436523 seconds Wrote graph to binary file in 0.04671001434326172 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911427 Louvain completed 21 runs in 1.7503738403320312 seconds PhenoGraph complete in 2.7742316722869873 seconds Found communities [-1, ... 21], with sizes: [246, 420, 396, 154, 135, 130, 126, 126, 119, 99, 95, 84, 83, 79, 69, 67, 62, 47, 30, 25, 23, 19, 16] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30905914306640625 seconds Jaccard graph constructed in 0.6204829216003418 seconds Wrote graph to binary file in 0.332430362701416 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909859 Louvain completed 21 runs in 1.6337625980377197 seconds PhenoGraph complete in 2.914146900177002 seconds Found communities [-1, ... 25], with sizes: [209, 397, 264, 227, 134, 131, 125, 124, 118, 105, 95, 88, 79, 76, 74, 72, 55, 55, 42, 34, 25, 25, 24, 20, 20, 17, 15] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3070664405822754 seconds Jaccard graph constructed in 0.6194899082183838 seconds Wrote graph to binary file in 0.045914411544799805 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910871 Louvain completed 21 runs in 1.6572914123535156 seconds PhenoGraph complete in 2.646458864212036 seconds Found communities [-1, ... 25], with sizes: [189, 390, 272, 148, 140, 118, 114, 113, 111, 110, 108, 97, 95, 91, 85, 81, 78, 59, 53, 40, 36, 27, 26, 20, 19, 17, 13] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30765438079833984 seconds Jaccard graph constructed in 0.615267276763916 seconds Wrote graph to binary file in 0.04660391807556152 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.908977 After 7 runs, maximum modularity is Q = 0.910094 Louvain completed 27 runs in 2.3271610736846924 seconds PhenoGraph complete in 3.310150384902954 seconds Found communities [-1, ... 22], with sizes: [203, 425, 358, 150, 131, 130, 120, 113, 111, 106, 94, 88, 85, 84, 79, 77, 70, 65, 39, 37, 29, 23, 18, 15] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30757808685302734 seconds Jaccard graph constructed in 0.6173884868621826 seconds Wrote graph to binary file in 0.04760885238647461 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912536 Louvain completed 21 runs in 1.622739315032959 seconds PhenoGraph complete in 2.6111230850219727 seconds Found communities [-1, ... 21], with sizes: [201, 354, 316, 145, 141, 133, 128, 126, 123, 121, 120, 120, 102, 91, 78, 78, 71, 48, 43, 35, 28, 24, 24] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30762290954589844 seconds Jaccard graph constructed in 0.6224753856658936 seconds Wrote graph to binary file in 0.04771590232849121 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912153 Louvain completed 21 runs in 1.6539711952209473 seconds PhenoGraph complete in 2.6452627182006836 seconds Found communities [-1, ... 21], with sizes: [206, 411, 370, 203, 157, 145, 123, 123, 106, 99, 93, 90, 83, 79, 73, 68, 66, 59, 23, 23, 21, 15, 14] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30814456939697266 seconds Jaccard graph constructed in 0.6318809986114502 seconds Wrote graph to binary file in 0.3320579528808594 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.91045 Louvain completed 21 runs in 1.6589550971984863 seconds PhenoGraph complete in 2.944554328918457 seconds Found communities [-1, ... 
20], with sizes: [220, 415, 348, 142, 132, 130, 128, 121, 116, 113, 106, 104, 94, 88, 81, 67, 62, 54, 48, 40, 23, 18] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3077964782714844 seconds Jaccard graph constructed in 0.6215500831604004 seconds Wrote graph to binary file in 0.046036720275878906 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911735 After 2 runs, maximum modularity is Q = 0.912774 Louvain completed 22 runs in 1.9951071739196777 seconds PhenoGraph complete in 2.9879136085510254 seconds Found communities [-1, ... 22], with sizes: [223, 411, 330, 145, 140, 131, 129, 127, 124, 122, 121, 102, 101, 86, 85, 69, 45, 28, 28, 26, 25, 22, 18, 12] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30730152130126953 seconds Jaccard graph constructed in 0.6207826137542725 seconds Wrote graph to binary file in 0.04430031776428223 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.911311 Louvain completed 21 runs in 1.6737017631530762 seconds PhenoGraph complete in 2.658686637878418 seconds Found communities [-1, ... 22], with sizes: [236, 373, 298, 173, 147, 136, 136, 127, 127, 127, 103, 102, 88, 83, 73, 64, 64, 43, 38, 30, 25, 23, 18, 16] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30754995346069336 seconds Jaccard graph constructed in 0.619056224822998 seconds Wrote graph to binary file in 0.04679417610168457 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.912466 Louvain completed 21 runs in 1.640221357345581 seconds PhenoGraph complete in 2.6274075508117676 seconds Found communities [-1, ... 21], with sizes: [224, 343, 323, 141, 133, 129, 126, 122, 119, 117, 112, 104, 90, 86, 84, 82, 81, 67, 52, 43, 31, 22, 19] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3131594657897949 seconds Jaccard graph constructed in 0.6099262237548828 seconds Wrote graph to binary file in 0.04532980918884277 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910656 Louvain completed 21 runs in 1.6737513542175293 seconds PhenoGraph complete in 2.654820680618286 seconds Found communities [-1, ... 23], with sizes: [186, 371, 345, 140, 136, 128, 127, 123, 122, 118, 115, 104, 91, 91, 87, 79, 74, 49, 33, 33, 30, 21, 20, 16, 11]
# Normalisation of sample D367_Brus_Dis1, run after doublet calling.
# The statements below modify the object step by step, so their order matters.
# Scale every cell to a total of 10,000 counts.
sc.pp.normalize_per_cell(D367_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D367_Brus_Dis1) # log transform the data
# The snapshot is taken AFTER normalisation and log1p, so `.raw` holds the
# log-normalised values for the full gene set, not the original UMI counts.
D367_Brus_Dis1.raw = D367_Brus_Dis1 # freeze the object (for later use of the raw state of it)
# Keep only the genes flagged True in var['ribo_genes'].
# NOTE(review): presumably this column is True for NON-ribosomal genes, as it
# is for the other samples in this notebook -- confirm where it is assigned.
D367_Brus_Dis1 = D367_Brus_Dis1[:, D367_Brus_Dis1.var['ribo_genes']]
# Bare expression: shows the object summary as the notebook cell output.
D367_Brus_Dis1
View of AnnData object with n_obs × n_vars = 2120 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Read the CellRanger filtered matrix of sample D372_Brus_Dis1, indexing genes
# by symbol; duplicated symbols are made unique right after loading.
D372_Brus_Dis1 = sc.read_10x_mtx(
    './D372_Brus_Dis1/' + outsPath, var_names='gene_symbols', cache=True)
D372_Brus_Dis1.var_names_make_unique()

# Sample-level annotations: the same value is stored for every cell.
D372_Brus_Dis1.obs['manip'] = 'D372_Brus_Dis1'
D372_Brus_Dis1.obs['position'] = 'Distal'
D372_Brus_Dis1.obs['method'] = 'Brushing'
D372_Brus_Dis1.obs['donor'] = 'D372'

# Prefix each barcode with the sample name and use the result as the cell
# identifier, so that cell names stay distinguishable across samples.
D372_Brus_Dis1.obs['name'] = [
    f'D372_Brus_Dis1_{barcode}' for barcode in D372_Brus_Dis1.obs_names]
D372_Brus_Dis1.obs_names = D372_Brus_Dis1.obs['name']

# Bare expression: shows the object summary as the notebook cell output.
D372_Brus_Dis1
... reading from cache file ./cache/D372_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
AnnData object with n_obs × n_vars = 1755 × 32739
obs: 'manip', 'position', 'method', 'donor', 'name'
var: 'gene_ids'
# Per-cell quality-control metrics for sample D372_Brus_Dis1.
sc.pl.highest_expr_genes(D372_Brus_Dis1, n_top=20)

# min_genes=0 is not meant to discard cells; the call is made so that
# obs['n_genes'] is available for the violin plot below.
sc.pp.filter_cells(D372_Brus_Dis1, min_genes=0)

# Mitochondrial genes are recognised by their 'MT-' symbol prefix.
# Despite its name, 'percent_mito' is a fraction in [0, 1] (no x100 scaling).
mito_genes = D372_Brus_Dis1.var_names.str.startswith('MT-')
D372_Brus_Dis1.obs['percent_mito'] = np.sum(
    D372_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D372_Brus_Dis1.X, axis=1).A1
D372_Brus_Dis1.obs['n_counts'] = D372_Brus_Dis1.X.sum(axis=1).A1

# remove_RB_genes is used here only for the list of ribosomal genes present
# in this sample; the other two return values are not used in this cell.
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Brus_Dis1.to_df())

# Boolean mask over genes, True for ribosomal genes. The DataFrame columns are
# the var_names, so the mask is built from var_names directly instead of
# materialising a second dense DataFrame just to read its column labels.
ribo_genes = D372_Brus_Dis1.var_names.isin(RB_genes_in_df)
D372_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D372_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Brus_Dis1.X, axis=1).A1

# CAUTION: despite its name, var['ribo_genes'] is the KEEP mask, i.e. True for
# genes that are NOT ribosomal. It is used later to drop the ribosomal genes.
D372_Brus_Dis1.var['ribo_genes'] = ~ribo_genes

sc.pl.violin(D372_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical ... storing 'position' as categorical ... storing 'method' as categorical ... storing 'donor' as categorical
# Low-quality cell filtering for sample D372_Brus_Dis1:
#   - at least 500 detected genes,
#   - fewer than 30,000 total counts,
#   - mitochondrial fraction below 0.5.
sc.pp.filter_cells(D372_Brus_Dis1, min_genes=500)
# Both thresholds are applied through one combined boolean mask.
D372_Brus_Dis1 = D372_Brus_Dis1[
    (D372_Brus_Dis1.obs['n_counts'] < 30000)
    & (D372_Brus_Dis1.obs['percent_mito'] < 0.5), :]
filtered out 19 cells that have less than 500 genes expressed
# scrublet: first doublet caller, run on the matrix of the QC-filtered cells
# (normalisation has not been applied yet at this point).
# expected_doublet_rate=0.014 is the assumed prior doublet rate (1.4%).
scrub = scr.Scrublet(D372_Brus_Dis1.X, expected_doublet_rate=0.014)
# Returns one doublet score per cell and the per-cell doublet call; the score
# threshold is chosen automatically by scrublet (see the cell output).
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# Store both results in the cell metadata.
D372_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D372_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
# Plot the score histograms to check the automatically chosen threshold.
scrub.plot_histogram()
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.15 Detected doublet rate = 0.3% Estimated detectable doublet fraction = 32.5% Overall doublet rate: Expected = 1.4% Estimated = 0.9% Elapsed time: 1.3 seconds
(<Figure size 640x240 with 2 Axes>,
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9e45b390>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9e296240>],
dtype=object))
# doubletDetection: second doublet caller, run on the same QC-filtered matrix
# (normalisation has not been applied yet at this point).
# The classifier is used with its default settings; the cell output shows
# 25 iterations and a warning that the sparse input is densified.
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D372_Brus_Dis1.X).predict()
# Store the per-cell label in the cell metadata.
# NOTE(review): presumably 1 = doublet, 0 = singlet, NaN = ambiguous --
# confirm against the doubletdetection documentation.
D372_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
warnings.warn("Sparse raw_counts is automatically densified.")
Iteration 1/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20571494102478027 seconds Jaccard graph constructed in 0.5139338970184326 seconds Wrote graph to binary file in 0.04161214828491211 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.902631 Louvain completed 21 runs in 1.607027530670166 seconds PhenoGraph complete in 2.402970790863037 seconds Found communities [-1, ... 18], with sizes: [151, 517, 267, 180, 134, 120, 113, 95, 90, 79, 62, 57, 55, 47, 46, 44, 32, 21, 19, 14] Iteration 2/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3072233200073242 seconds Jaccard graph constructed in 0.5509219169616699 seconds Wrote graph to binary file in 0.03649640083312988 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906597 Louvain completed 21 runs in 1.6010956764221191 seconds PhenoGraph complete in 2.507722854614258 seconds Found communities [-1, ... 18], with sizes: [174, 538, 248, 180, 132, 130, 124, 89, 83, 73, 71, 52, 46, 45, 40, 36, 29, 21, 18, 14] Iteration 3/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20583438873291016 seconds Jaccard graph constructed in 0.524289608001709 seconds Wrote graph to binary file in 0.07836651802062988 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.899191 After 3 runs, maximum modularity is Q = 0.900269 Louvain completed 23 runs in 1.9868910312652588 seconds PhenoGraph complete in 2.8087825775146484 seconds Found communities [-1, ... 19], with sizes: [189, 527, 210, 178, 177, 155, 118, 91, 85, 56, 54, 53, 46, 41, 40, 29, 25, 22, 20, 14, 13] Iteration 4/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30797839164733887 seconds Jaccard graph constructed in 0.5188636779785156 seconds Wrote graph to binary file in 0.042851924896240234 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905209 Louvain completed 21 runs in 1.6132774353027344 seconds PhenoGraph complete in 2.5005009174346924 seconds Found communities [-1, ... 18], with sizes: [126, 468, 313, 176, 158, 134, 123, 117, 111, 83, 56, 49, 48, 43, 42, 38, 19, 16, 12, 11] Iteration 5/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20615077018737793 seconds Jaccard graph constructed in 0.5189366340637207 seconds Wrote graph to binary file in 0.38534021377563477 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905226 Louvain completed 21 runs in 1.6292517185211182 seconds PhenoGraph complete in 2.751446008682251 seconds Found communities [-1, ... 18], with sizes: [154, 466, 341, 180, 128, 127, 120, 93, 93, 82, 63, 56, 49, 41, 41, 28, 26, 22, 19, 14] Iteration 6/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20828032493591309 seconds Jaccard graph constructed in 0.619469165802002 seconds Wrote graph to binary file in 0.041007041931152344 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904828 After 3 runs, maximum modularity is Q = 0.906637 Louvain completed 23 runs in 1.9833974838256836 seconds PhenoGraph complete in 2.8635506629943848 seconds Found communities [-1, ... 19], with sizes: [131, 430, 266, 183, 139, 117, 114, 111, 101, 86, 85, 59, 53, 46, 44, 44, 43, 38, 22, 20, 11] Iteration 7/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2070465087890625 seconds Jaccard graph constructed in 0.6272745132446289 seconds Wrote graph to binary file in 0.03924131393432617 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904488 Louvain completed 21 runs in 1.5961191654205322 seconds PhenoGraph complete in 2.481203556060791 seconds Found communities [-1, ... 20], with sizes: [168, 374, 256, 192, 184, 139, 138, 110, 83, 77, 76, 57, 55, 43, 42, 36, 31, 21, 19, 15, 14, 13] Iteration 8/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20821666717529297 seconds Jaccard graph constructed in 0.6178300380706787 seconds Wrote graph to binary file in 0.0373225212097168 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90347 Louvain completed 21 runs in 1.6073861122131348 seconds PhenoGraph complete in 2.482001543045044 seconds Found communities [-1, ... 17], with sizes: [184, 470, 316, 174, 155, 136, 126, 112, 72, 62, 57, 46, 45, 42, 42, 35, 34, 22, 13] Iteration 9/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2064056396484375 seconds Jaccard graph constructed in 0.5516624450683594 seconds Wrote graph to binary file in 0.06741881370544434 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.910551 Louvain completed 21 runs in 1.5890874862670898 seconds PhenoGraph complete in 2.4282758235931396 seconds Found communities [-1, ... 
17], with sizes: [179, 466, 301, 180, 139, 115, 109, 109, 103, 96, 55, 50, 49, 45, 43, 42, 31, 19, 12] Iteration 10/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.21695446968078613 seconds Jaccard graph constructed in 0.5578131675720215 seconds Wrote graph to binary file in 0.07282543182373047 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904749 Louvain completed 21 runs in 1.602863073348999 seconds PhenoGraph complete in 2.4634809494018555 seconds Found communities [-1, ... 19], with sizes: [175, 504, 299, 179, 136, 126, 111, 106, 105, 70, 55, 54, 40, 35, 34, 33, 28, 17, 13, 12, 11] Iteration 11/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20670032501220703 seconds Jaccard graph constructed in 0.5516238212585449 seconds Wrote graph to binary file in 0.051191091537475586 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904208 Louvain completed 21 runs in 1.6287384033203125 seconds PhenoGraph complete in 2.4512691497802734 seconds Found communities [-1, ... 19], with sizes: [178, 367, 283, 181, 145, 140, 136, 122, 113, 108, 72, 55, 45, 41, 40, 29, 28, 19, 15, 14, 12] Iteration 12/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20838570594787598 seconds Jaccard graph constructed in 0.6194431781768799 seconds Wrote graph to binary file in 0.32436418533325195 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.901203 After 5 runs, maximum modularity is Q = 0.903511 Louvain completed 25 runs in 2.1236932277679443 seconds PhenoGraph complete in 3.292933225631714 seconds Found communities [-1, ... 18], with sizes: [176, 456, 323, 177, 171, 122, 112, 107, 98, 85, 55, 43, 42, 42, 37, 30, 19, 17, 17, 14] Iteration 13/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20653152465820312 seconds Jaccard graph constructed in 0.5965385437011719 seconds Wrote graph to binary file in 0.04018712043762207 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904997 Louvain completed 21 runs in 1.5950124263763428 seconds PhenoGraph complete in 2.456677198410034 seconds Found communities [-1, ... 19], with sizes: [210, 506, 304, 176, 116, 109, 104, 97, 95, 86, 63, 55, 41, 38, 37, 31, 20, 19, 13, 12, 11] Iteration 14/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20726919174194336 seconds Jaccard graph constructed in 0.539621114730835 seconds Wrote graph to binary file in 0.050312042236328125 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904782 After 5 runs, maximum modularity is Q = 0.905838 Louvain completed 25 runs in 2.1546876430511475 seconds PhenoGraph complete in 2.965471029281616 seconds Found communities [-1, ... 20], with sizes: [157, 496, 363, 177, 114, 110, 110, 97, 70, 67, 51, 43, 43, 42, 37, 33, 33, 32, 20, 18, 16, 14] Iteration 15/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20701122283935547 seconds Jaccard graph constructed in 0.5227954387664795 seconds Wrote graph to binary file in 0.05220174789428711 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.9051 After 2 runs, maximum modularity is Q = 0.90653 Louvain completed 22 runs in 1.9522175788879395 seconds PhenoGraph complete in 2.766913890838623 seconds Found communities [-1, ... 17], with sizes: [185, 465, 443, 184, 114, 99, 94, 83, 81, 67, 63, 54, 51, 40, 35, 28, 23, 22, 12] Iteration 16/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20703864097595215 seconds Jaccard graph constructed in 0.5422213077545166 seconds Wrote graph to binary file in 0.05976581573486328 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.903313 Louvain completed 21 runs in 1.6176977157592773 seconds PhenoGraph complete in 2.439768075942993 seconds Found communities [-1, ... 18], with sizes: [236, 430, 325, 182, 118, 118, 111, 109, 100, 85, 54, 53, 43, 39, 31, 30, 24, 23, 18, 14] Iteration 17/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20911455154418945 seconds Jaccard graph constructed in 0.5380370616912842 seconds Wrote graph to binary file in 0.0778660774230957 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904103 After 3 runs, maximum modularity is Q = 0.906238 Louvain completed 23 runs in 2.006376266479492 seconds PhenoGraph complete in 2.848581314086914 seconds Found communities [-1, ... 17], with sizes: [195, 419, 322, 176, 132, 118, 108, 107, 107, 101, 59, 57, 55, 49, 42, 35, 30, 20, 11] Iteration 18/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20604419708251953 seconds Jaccard graph constructed in 0.5328047275543213 seconds Wrote graph to binary file in 0.36903810501098633 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906027 After 2 runs, maximum modularity is Q = 0.908249 Louvain completed 22 runs in 1.9443886280059814 seconds PhenoGraph complete in 3.064648151397705 seconds Found communities [-1, ... 19], with sizes: [181, 422, 339, 181, 127, 108, 100, 99, 87, 87, 83, 64, 59, 47, 42, 29, 29, 19, 15, 14, 11] Iteration 19/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20657038688659668 seconds Jaccard graph constructed in 0.608447790145874 seconds Wrote graph to binary file in 0.037868499755859375 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.909134 Louvain completed 21 runs in 1.6105339527130127 seconds PhenoGraph complete in 2.4751977920532227 seconds Found communities [-1, ... 18], with sizes: [168, 495, 303, 181, 134, 114, 109, 100, 75, 72, 55, 53, 53, 48, 47, 42, 37, 25, 18, 14] Iteration 20/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20666742324829102 seconds Jaccard graph constructed in 0.5423116683959961 seconds Wrote graph to binary file in 0.05187082290649414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.900571 After 8 runs, maximum modularity is Q = 0.901774 Louvain completed 28 runs in 2.328256130218506 seconds PhenoGraph complete in 3.1406610012054443 seconds Found communities [-1, ... 16], with sizes: [165, 467, 333, 177, 138, 127, 117, 108, 107, 99, 60, 56, 50, 38, 36, 31, 21, 13] Iteration 21/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20690321922302246 seconds Jaccard graph constructed in 0.5393500328063965 seconds Wrote graph to binary file in 0.05608105659484863 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.904099 After 2 runs, maximum modularity is Q = 0.905887 After 9 runs, maximum modularity is Q = 0.906971 Louvain completed 29 runs in 2.634321689605713 seconds PhenoGraph complete in 3.4601807594299316 seconds Found communities [-1, ... 18], with sizes: [174, 484, 243, 176, 156, 148, 119, 88, 87, 72, 65, 59, 53, 51, 43, 39, 28, 23, 22, 13] Iteration 22/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20659542083740234 seconds Jaccard graph constructed in 0.5471713542938232 seconds Wrote graph to binary file in 0.04985666275024414 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906413 Louvain completed 21 runs in 1.6360414028167725 seconds PhenoGraph complete in 2.4550068378448486 seconds Found communities [-1, ... 18], with sizes: [160, 471, 235, 231, 179, 114, 106, 95, 83, 80, 67, 57, 54, 43, 42, 37, 32, 27, 18, 12] Iteration 23/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20646929740905762 seconds Jaccard graph constructed in 0.5364046096801758 seconds Wrote graph to binary file in 0.05254793167114258 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.90219 After 2 runs, maximum modularity is Q = 0.903544 Louvain completed 22 runs in 1.9394330978393555 seconds PhenoGraph complete in 2.764660120010376 seconds Found communities [-1, ... 19], with sizes: [159, 354, 326, 177, 138, 133, 122, 115, 109, 83, 81, 55, 54, 42, 35, 32, 32, 31, 25, 20, 20] Iteration 24/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... 
Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20595860481262207 seconds Jaccard graph constructed in 0.5301704406738281 seconds Wrote graph to binary file in 0.3261222839355469 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.905169 Louvain completed 21 runs in 1.58907151222229 seconds PhenoGraph complete in 2.6655876636505127 seconds Found communities [-1, ... 17], with sizes: [173, 519, 286, 178, 131, 122, 121, 111, 96, 84, 55, 55, 48, 45, 31, 30, 28, 17, 13] Iteration 25/25 Creating synthetic doublets... Normalizing... Running PCA... Clustering augmented data set with Phenograph... Setting directed=False because prune=True Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.20603418350219727 seconds Jaccard graph constructed in 0.5456793308258057 seconds Wrote graph to binary file in 0.05715012550354004 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.906646 Louvain completed 21 runs in 1.6018450260162354 seconds PhenoGraph complete in 2.42425274848938 seconds Found communities [-1, ... 17], with sizes: [155, 531, 272, 179, 130, 119, 118, 105, 92, 81, 70, 52, 52, 44, 41, 40, 25, 21, 16]
# Normalisation of sample D372_Brus_Dis1, run after both doublet callers so
# that they were given the un-normalised matrix.
# The statements below modify the object step by step, so their order matters.
# Scale every cell to a total of 10,000 counts.
sc.pp.normalize_per_cell(D372_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D372_Brus_Dis1) # log transform the data
# The snapshot is taken AFTER normalisation and log1p, so `.raw` holds the
# log-normalised values for the full gene set, not the original UMI counts.
D372_Brus_Dis1.raw = D372_Brus_Dis1 # freeze the object (for later use of the raw state of it)
# Drop the ribosomal genes: var['ribo_genes'] was set to the negation of the
# ribosomal mask, so it is True for the genes to KEEP.
D372_Brus_Dis1 = D372_Brus_Dis1[:, D372_Brus_Dis1.var['ribo_genes']]
# Bare expression: shows the object summary as the notebook cell output.
D372_Brus_Dis1
View of AnnData object with n_obs × n_vars = 1715 × 32568
obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
var: 'gene_ids', 'ribo_genes'
# Merge all per-sample objects into a single AnnData, one argument row per donor.
# join='inner' keeps only the genes that are present in every sample.
adata = D322_Biop_Nas1.concatenate(D322_Biop_Pro1, D322_Biop_Int1,
D326_Biop_Pro1, D326_Biop_Int1, D326_Brus_Dis1,
D337_Brus_Dis1,
D339_Biop_Nas1, D339_Biop_Pro1, D339_Biop_Int1, D339_Brus_Dis1,
D344_Biop_Nas1, D344_Biop_Pro1, D344_Biop_Int1, D344_Brus_Dis1,
D345_Biop_Nas1,
D353_Brus_Nas1, D353_Biop_Pro1, D353_Biop_Int2, D353_Brus_Dis1,
D354_Biop_Pro1, D354_Biop_Int2, D354_Brus_Dis1,
D363_Brus_Nas1, D363_Biop_Pro1, D363_Biop_Int2, D363_Brus_Dis1,
D367_Brus_Nas1, D367_Biop_Pro1, D367_Biop_Int1, D367_Brus_Dis1,
D372_Brus_Nas1, D372_Biop_Pro1, D372_Biop_Int1, D372_Biop_Int2, D372_Brus_Dis1,
join='inner')
# Save the merged, log-normalised dataset together with both doublet calls.
# NOTE(review): this is an absolute path under the filesystem root ('/Data/'),
# whereas the inputs are read relative to '/home/deprez/HCA/Data/' -- confirm
# that this location is intended.
adata.write('/Data/Preprocessed_doublet_dataset.h5ad')
... storing 'donor' as categorical ... storing 'manip' as categorical ... storing 'method' as categorical ... storing 'position' as categorical
# Export the per-cell metadata of the merged dataset as a tab-separated table.
adata.obs.to_csv('/Data/metadata_doublet.tsv', sep='\t')